In [1]:
import gensim
import pandas as pd
from eric_chen_forward import util

In [2]:
df = pd.read_csv('new_dataset.csv')
df['cleaned_text'] = df['paragraph'].apply(lambda x: util.clean_document(x))
df['tokenized'] = df['cleaned_text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,label,paragraph,cleaned_text,tokenized
0,Academics,Associate Professor Song has expanded the Busi...,associate professor song expanded business int...,"[associate, professor, song, expanded, busines..."
1,Academics,MSLIS student and ALA Spectrum Scholar Reynoso...,mslis student ala spectrum scholar reynoso dra...,"[mslis, student, ala, spectrum, scholar, reyno..."
2,Academics,Professor Downie gave the keynote for the 2023...,professor downie gave keynote acm sigir confer...,"[professor, downie, gave, keynote, acm, sigir,..."
3,Academics,COURSE LOADS AND OVERLOADS\nThe normal course ...,course load overload normal course load varies...,"[course, load, overload, normal, course, load,..."
4,Academics,The maximum number of hours a student can take...,maximum number hour student take given term de...,"[maximum, number, hour, student, take, given, ..."


In [3]:
model = gensim.models.Word2Vec(
    window=5,
    min_count=2,
    workers=4,
)

In [4]:
text = df['tokenized']

In [5]:
text.loc[0]

['associate',
 'professor',
 'song',
 'expanded',
 'business',
 'intelligence',
 'group',
 'student',
 'consultancy',
 'group',
 'associated']

In [6]:
model.build_vocab(text, progress_per=1000)

In [7]:
model.epochs

5

In [8]:
model.corpus_count

2467

In [9]:
model.train(text, total_examples=model.corpus_count, epochs=model.epochs)

(441167, 493985)

In [10]:
model.save("w2v.model")

In [11]:
len(model.wv)

6237

In [12]:
model.wv.most_similar("college")

[('school', 0.9995893836021423),
 ('education', 0.9992655515670776),
 ('business', 0.9992366433143616),
 ('graduate', 0.9990893602371216),
 ('liberal', 0.9990546703338623),
 ('mathematics', 0.9989520311355591),
 ('earn', 0.9988529682159424),
 ('phd', 0.998820424079895),
 ('curriculum', 0.9987074136734009),
 ('additional', 0.9986926913261414)]

In [13]:
model.wv.most_similar("computer")

[('science', 0.9969636797904968),
 ('bachelor', 0.9964192509651184),
 ('degree', 0.9933509230613708),
 ('b', 0.9929066896438599),
 ('engineering', 0.9886715412139893),
 ('psychology', 0.9879353642463684),
 ('electrical', 0.9875955581665039),
 ('joint', 0.9868075251579285),
 ('specialization', 0.9857147336006165),
 ('art', 0.9848703145980835)]

In [14]:
model.wv.similarity('college', 'school')

0.9995894

In [15]:
model.wv.get_mean_vector(text.loc[0])

array([-0.07878568,  0.12007365,  0.08206153,  0.00452788,  0.01928951,
       -0.21224527,  0.06692049,  0.2773162 , -0.12414341, -0.01793329,
        0.0101476 , -0.20531438,  0.00960027,  0.03292124,  0.03104411,
       -0.16742165,  0.01701005, -0.14567567,  0.03178149, -0.2574174 ,
        0.06980379,  0.08707588, -0.02720303, -0.06010542, -0.04038551,
       -0.03431287, -0.04606346, -0.07700709,  0.02048071,  0.01197418,
        0.21116878,  0.03144749, -0.04558324, -0.02473793, -0.05907683,
        0.14991055, -0.00161541, -0.0881497 , -0.03490939, -0.23027214,
        0.14745016, -0.1433695 , -0.07294632,  0.05636094,  0.04947818,
       -0.07458551,  0.01840388, -0.00400996,  0.02164999,  0.0675102 ,
       -0.00798365, -0.17749128, -0.04656086, -0.01476886, -0.08945723,
        0.05352692,  0.10045405,  0.00405218, -0.08974165,  0.01484045,
        0.06696312, -0.02525364,  0.00603367, -0.00031035, -0.10100944,
        0.02100647,  0.11614598,  0.04815542, -0.17684355,  0.11

In [16]:
df['vector'] = df['tokenized'].apply(lambda x: model.wv.get_mean_vector(x))
df.head()

Unnamed: 0,label,paragraph,cleaned_text,tokenized,vector
0,Academics,Associate Professor Song has expanded the Busi...,associate professor song expanded business int...,"[associate, professor, song, expanded, busines...","[-0.07878568, 0.12007365, 0.08206153, 0.004527..."
1,Academics,MSLIS student and ALA Spectrum Scholar Reynoso...,mslis student ala spectrum scholar reynoso dra...,"[mslis, student, ala, spectrum, scholar, reyno...","[-0.08084031, 0.13093157, 0.076495126, -0.0011..."
2,Academics,Professor Downie gave the keynote for the 2023...,professor downie gave keynote acm sigir confer...,"[professor, downie, gave, keynote, acm, sigir,...","[-0.07699328, 0.1190415, 0.08183074, 0.0030425..."
3,Academics,COURSE LOADS AND OVERLOADS\nThe normal course ...,course load overload normal course load varies...,"[course, load, overload, normal, course, load,...","[-0.07635715, 0.1196787, 0.08338048, 0.0025621..."
4,Academics,The maximum number of hours a student can take...,maximum number hour student take given term de...,"[maximum, number, hour, student, take, given, ...","[-0.0763707, 0.12011519, 0.07883641, -0.000130..."


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np
import pickle

In [19]:
X = df['vector']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)

In [20]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [21]:
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=0, tol=None)
sgd.fit(X_train_2d, y_train)
y_pred = sgd.predict(X_test_2d)

print(f'accuracy: {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred))

accuracy: 0.5591572123176661
              precision    recall  f1-score   support

   Academics       0.53      1.00      0.69       224
      Alumni       0.62      0.76      0.68       152
      Campus       0.00      0.00      0.00        69
     History       0.00      0.00      0.00        50
Student Life       0.88      0.06      0.11       122

    accuracy                           0.56       617
   macro avg       0.40      0.36      0.30       617
weighted avg       0.52      0.56      0.44       617



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
lr = LogisticRegression(C=1e5)
lr.fit(X_train_2d, y_train)
y_pred = lr.predict(X_test_2d)

print(f'accuracy: {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred))

accuracy: 0.807131280388979
              precision    recall  f1-score   support

   Academics       0.89      0.92      0.91       224
      Alumni       0.84      0.86      0.85       152
      Campus       0.54      0.62      0.58        69
     History       0.58      0.30      0.39        50
Student Life       0.84      0.84      0.84       122

    accuracy                           0.81       617
   macro avg       0.74      0.71      0.71       617
weighted avg       0.80      0.81      0.80       617



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train_2d, y_train)
y_pred = gbc.predict(X_test_2d)

print(f'accuracy: {accuracy_score(y_pred, y_test)}')
print(classification_report(y_test, y_pred))

accuracy: 0.8038897893030794
              precision    recall  f1-score   support

   Academics       0.88      0.93      0.90       224
      Alumni       0.86      0.86      0.86       152
      Campus       0.53      0.52      0.53        69
     History       0.62      0.50      0.56        50
Student Life       0.80      0.79      0.79       122

    accuracy                           0.80       617
   macro avg       0.74      0.72      0.73       617
weighted avg       0.80      0.80      0.80       617



In [24]:
with open('w2v_model.pkl', 'wb') as f:
    pickle.dump(gbc, f)

In [28]:
test_passage = "Illinois has tremendous breadth and depth in academics, with more than 150 undergraduate and more than 100 graduate and professional programs."

In [29]:
cleaned_text = util.clean_document(test_passage).split()
vector = model.wv.get_mean_vector(cleaned_text).reshape(1, -1)

gbc.predict(vector)[0]

'Academics'