In [58]:
import pandas as pd
import numpy as np
from scipy import sparse

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction import text
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score

import pickle

## Raw data pre-processing
- load the raw data
- rename columns
- remove the ID column
- drop incomplete entries
- split the dataset into training and test sets

In [5]:
# Load the raw dataset from CSV; the path is relative to the current working directory.
df = pd.read_csv('AO_df_15Feb_withunimajor.csv')

In [6]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Uni_name,Uni_Major,Uni_Jobs,Uni_Course_lectureres,Uni_Facilities,Uni_Local_life,Uni_Societies_sports
0,0,University of Leeds,Accounting and Finance BSc (Hons),2.0,1.0,0.0,4.0,4.0
1,1,University of Leeds,Geography BA (Hons),3.0,2.0,1.0,3.0,0.0
2,2,University of Leeds,Chemical Engineering MEng (Hons),4.0,4.0,0.0,0.0,1.0
3,3,University of Leeds,Genetics BSc (Hons),4.0,3.0,4.0,4.0,0.0
4,4,University of Leeds,Art and Design BA (Hons),2.0,1.0,3.0,1.0,3.0


In [7]:
# Translate the raw CSV headers into the snake_case names used in the rest of
# the notebook. Columns not listed here keep their original name.
column_renames = {
    'Uni_name': 'uni_name',
    'Uni_Major': 'uni_major',
    'Uni_Jobs': 'job_prospects',
    'Uni_Course_lectureres': 'course_lectures',
    'Uni_Facilities': 'facilities',
    'Uni_Local_life': 'local_life',
    'Uni_Societies_sports': 'societies_and_sports',
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0.1,Unnamed: 0,uni_name,uni_major,job_prospects,course_lectures,facilities,local_life,societies_and_sports
0,0,University of Leeds,Accounting and Finance BSc (Hons),2.0,1.0,0.0,4.0,4.0
1,1,University of Leeds,Geography BA (Hons),3.0,2.0,1.0,3.0,0.0
2,2,University of Leeds,Chemical Engineering MEng (Hons),4.0,4.0,0.0,0.0,1.0
3,3,University of Leeds,Genetics BSc (Hons),4.0,3.0,4.0,4.0,0.0
4,4,University of Leeds,Art and Design BA (Hons),2.0,1.0,3.0,1.0,3.0


In [8]:
# Remove the 'Unnamed: 0' ID column (presumably a saved index; it carries no
# information for the model). Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Confirm that the ID column is gone.
df.head()

Unnamed: 0,uni_name,uni_major,job_prospects,course_lectures,facilities,local_life,societies_and_sports
0,University of Leeds,Accounting and Finance BSc (Hons),2.0,1.0,0.0,4.0,4.0
1,University of Leeds,Geography BA (Hons),3.0,2.0,1.0,3.0,0.0
2,University of Leeds,Chemical Engineering MEng (Hons),4.0,4.0,0.0,0.0,1.0
3,University of Leeds,Genetics BSc (Hons),4.0,3.0,4.0,4.0,0.0
4,University of Leeds,Art and Design BA (Hons),2.0,1.0,3.0,1.0,3.0


In [10]:
# Keep only rows without any missing value. The result is an independent
# frame, so df itself stays untouched for later comparison.
clean_data = df.dropna(how='any').copy()
clean_data.head()

Unnamed: 0,uni_name,uni_major,job_prospects,course_lectures,facilities,local_life,societies_and_sports
0,University of Leeds,Accounting and Finance BSc (Hons),2.0,1.0,0.0,4.0,4.0
1,University of Leeds,Geography BA (Hons),3.0,2.0,1.0,3.0,0.0
2,University of Leeds,Chemical Engineering MEng (Hons),4.0,4.0,0.0,0.0,1.0
3,University of Leeds,Genetics BSc (Hons),4.0,3.0,4.0,4.0,0.0
4,University of Leeds,Art and Design BA (Hons),2.0,1.0,3.0,1.0,3.0


In [13]:
# Row counts of the original frame versus the frame without incomplete rows.
print(f'before: {df.shape[0]}')
print(f'afer:   {clean_data.shape[0]}')

# Missing values per column of the original frame, one line per column.
print()
for column, n_missing in df.isna().sum().items():
    print(f'{column}: {n_missing}')

before: 3238
afer:   2397

uni_name: 0
uni_major: 841
job_prospects: 0
course_lectures: 0
facilities: 0
local_life: 0
societies_and_sports: 0


In [14]:
# Number of distinct values per column, followed by summary statistics of the
# numeric columns of the cleaned frame.
distinct_counts = clean_data.nunique()
print(distinct_counts)
clean_data.describe()

uni_name                 19
uni_major               852
job_prospects             5
course_lectures           5
facilities                5
local_life                5
societies_and_sports      5
dtype: int64


Unnamed: 0,job_prospects,course_lectures,facilities,local_life,societies_and_sports
count,2397.0,2397.0,2397.0,2397.0,2397.0
mean,2.264497,2.695035,2.554443,2.09053,1.981227
std,1.773755,1.640736,1.678966,1.839031,1.820647
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0
50%,3.0,3.0,3.0,3.0,3.0
75%,4.0,4.0,4.0,4.0,4.0
max,4.0,4.0,4.0,4.0,4.0


In [15]:
# Number of rows per university, most frequent first.
# Counted on df (before dropna), so rows with a missing uni_major are included.
df['uni_name'].value_counts()

University of Edinburgh                        181
University of Manchester                       181
Lancaster University                           181
University of Warwick                          181
University of York                             181
University of Glasgow                          181
University of Leeds                            181
University of Southampton                      181
University of Bristol                          180
UCL (University College London)                180
University of Sheffield                        180
King's College London, University of London    180
Durham University                              170
University of Nottingham                       170
University of Aberdeen                         160
Queen Mary University of London                150
University of Cambridge                        150
Imperial College London                        140
University of Oxford                           130
Name: uni_name, dtype: int64

In [16]:
# ...and similarly for majors: one summary row (count/unique/top/freq) per
# distinct uni_major value. Rows with a missing uni_major do not form a group.
df.groupby(by='uni_major')['uni_major'].describe()

Unnamed: 0_level_0,count,unique,top,freq
uni_major,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accountancy and Finance MA (Hons),1,1,Accountancy and Finance MA (Hons),1
Accountancy with International Accounting BAcc (Hons),1,1,Accountancy with International Accounting BAcc...,1
Accounting BSc (Hons),3,1,Accounting BSc (Hons),3
Accounting and Finance (Industry) BSc (Hons),1,1,Accounting and Finance (Industry) BSc (Hons),1
Accounting and Finance BA (Hons),1,1,Accounting and Finance BA (Hons),1
...,...,...,...,...
War Studies BA (Hons),3,1,War Studies BA (Hons),3
War Studies MA,2,1,War Studies MA,2
World History BA (Hons),3,1,World History BA (Hons),3
Zoology BSc (Hons),18,1,Zoology BSc (Hons),18


In [17]:
# Model inputs: the free-text course title plus the five rating columns.
# uni_name is deliberately left out because it is the prediction target.
feature_columns = [
    'uni_major',
    'job_prospects',
    'course_lectures',
    'facilities',
    'local_life',
    'societies_and_sports',
]
X = clean_data[feature_columns]
X.head()

Unnamed: 0,uni_major,job_prospects,course_lectures,facilities,local_life,societies_and_sports
0,Accounting and Finance BSc (Hons),2.0,1.0,0.0,4.0,4.0
1,Geography BA (Hons),3.0,2.0,1.0,3.0,0.0
2,Chemical Engineering MEng (Hons),4.0,4.0,0.0,0.0,1.0
3,Genetics BSc (Hons),4.0,3.0,4.0,4.0,0.0
4,Art and Design BA (Hons),2.0,1.0,3.0,1.0,3.0


In [18]:
# Target: the university of each row. Plain column selection (rather than
# pop) leaves clean_data unmodified.
y = clean_data['uni_name']
y.head()

0    University of Leeds
1    University of Leeds
2    University of Leeds
3    University of Leeds
4    University of Leeds
Name: uni_name, dtype: object

In [37]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# each university the same in both parts; the fixed seed makes the split
# reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=2018)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [20]:
# Configure the vectorizer up front, the same way an estimator is configured.
TfidfVec = TfidfVectorizer(
    # sublinear_tf=True,
    max_df=0.3,
    max_features=10000,
    norm='l2'
)

# Exploration only: the vectorizer is fitted on every course title in X (not
# just the training split) in order to inspect the resulting vocabulary.
temp = TfidfVec.fit_transform(X['uni_major'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# normalise the result to a plain list of str so the printout stays the same.
if hasattr(TfidfVec, 'get_feature_names_out'):
    fnames = [str(term) for term in TfidfVec.get_feature_names_out()]
else:
    fnames = TfidfVec.get_feature_names()

print(fnames)
print('num of features: ' + str(len(fnames)))
print(temp.shape)

['11', '18', 'abroad', 'accountancy', 'accounting', 'actuarial', 'administration', 'adult', 'advance', 'advanced', 'advertising', 'aerodynamics', 'aeronautical', 'aeronautics', 'aerospace', 'affairs', 'american', 'an', 'analysis', 'analytics', 'ancient', 'and', 'anglo', 'animation', 'anthropology', 'applied', 'arabic', 'archaeology', 'architectural', 'architecture', 'art', 'artificial', 'arts', 'asian', 'astronautics', 'astronomy', 'astrophysics', 'at', 'automotive', 'ba', 'bacc', 'baecom', 'banking', 'barch', 'basc', 'based', 'bass', 'bds', 'behavioural', 'beng', 'bfa', 'big', 'biochemical', 'biochemistry', 'bioengineering', 'biological', 'biology', 'biomedical', 'biomedicine', 'bioprocessing', 'biosciences', 'biotechnology', 'bmbch', 'bmbs', 'bmid', 'bmus', 'bnurs', 'brain', 'bristol', 'bs', 'bsocsc', 'business', 'bvms', 'by', 'canadian', 'cardiac', 'cdt', 'cell', 'celtic', 'certificate', 'chain', 'change', 'chb', 'chemical', 'chemistry', 'child', 'children', 'chinese', 'city', 'civi

In [24]:
# English stop words plus a few course-title specific tokens. The vectorizer's
# stop_words parameter is documented as 'english', a list, or None; a frozenset
# is rejected by parameter validation in newer scikit-learn releases, so the
# union is converted to a (sorted, hence deterministic) list.
my_stop_words = sorted(
    text.ENGLISH_STOP_WORDS.union(['or', 'year', 'years', 'yr', 'yrs', 'mmath'])
)

In [38]:
# Same vectorizer configuration as before, now filtering my_stop_words.
# This TfidfVec instance is the one handed to the ColumnTransformer below.
TfidfVec = TfidfVectorizer(
    # sublinear_tf=True,
    stop_words=my_stop_words,
    max_df=0.3,
    max_features=10000,
    norm='l2'
)

# Exploration only: fitted on every course title in X (not just the training
# split) to check which tokens survive the stop-word filter.
temp = TfidfVec.fit_transform(X['uni_major'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# normalise the result to a plain list of str so the printout stays the same.
if hasattr(TfidfVec, 'get_feature_names_out'):
    fnames = [str(term) for term in TfidfVec.get_feature_names_out()]
else:
    fnames = TfidfVec.get_feature_names()
print(fnames)
print('num of features: ' + str(len(fnames)))
print(temp.shape)

['11', '18', 'abroad', 'accountancy', 'accounting', 'actuarial', 'administration', 'adult', 'advance', 'advanced', 'advertising', 'aerodynamics', 'aeronautical', 'aeronautics', 'aerospace', 'affairs', 'american', 'analysis', 'analytics', 'ancient', 'anglo', 'animation', 'anthropology', 'applied', 'arabic', 'archaeology', 'architectural', 'architecture', 'art', 'artificial', 'arts', 'asian', 'astronautics', 'astronomy', 'astrophysics', 'automotive', 'ba', 'bacc', 'baecom', 'banking', 'barch', 'basc', 'based', 'bass', 'bds', 'behavioural', 'beng', 'bfa', 'big', 'biochemical', 'biochemistry', 'bioengineering', 'biological', 'biology', 'biomedical', 'biomedicine', 'bioprocessing', 'biosciences', 'biotechnology', 'bmbch', 'bmbs', 'bmid', 'bmus', 'bnurs', 'brain', 'bristol', 'bs', 'bsocsc', 'business', 'bvms', 'canadian', 'cardiac', 'cdt', 'cell', 'celtic', 'certificate', 'chain', 'change', 'chb', 'chemical', 'chemistry', 'child', 'children', 'chinese', 'city', 'civil', 'civilisation', 'clas

In [39]:
# Column-wise preprocessing: TF-IDF on the course title, standard scaling on
# the rating columns.
rating_columns = [
    'job_prospects',
    'course_lectures',
    'facilities',
    'local_life',
    'societies_and_sports',
]
preprocessor = ColumnTransformer(
    [
        ('corpus', TfidfVec, 'uni_major'),
        ('numerical', StandardScaler(), rating_columns),
    ],
    remainder='drop',    # columns not listed above are discarded
    sparse_threshold=1,  # density cut-off below which the stacked output stays sparse
)

In [40]:
# Full model: preprocessing followed by an unregularised one-vs-rest logistic
# regression. Regularisation is switched off with C=np.inf (default 'l2'
# penalty) instead of penalty='none': that string was removed in scikit-learn
# 1.4, whereas an infinite C is accepted by old and new releases alike.
# NOTE(review): multi_class='ovr' is itself deprecated in recent scikit-learn
# releases - revisit when upgrading.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=np.inf, multi_class='ovr',
                                      solver='lbfgs', max_iter=10000)),
])

In [45]:
# Baseline experiment: ignore uni_major entirely and predict the university
# from the five rating columns alone. Because uni_major is not needed, the
# rows that were dropped for a missing major can be used as well (df instead
# of clean_data).
temp_data = df.copy()
X1 = temp_data[['job_prospects', 'course_lectures', 'facilities', 'local_life', 'societies_and_sports']]
y1 = temp_data['uni_name']

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, stratify=y1, random_state=2018)

# The former mid-cell X1.head() / y1.head() calls were removed: only a cell's
# last expression is displayed, so their results were silently discarded.
model = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X1_train, y1_train)
model.score(X1_test, y1_test)

0.08333333333333333

In [42]:
# Inspect the training labels (index values refer to rows of clean_data).
y_train

336        University of Bristol
536      Imperial College London
1892        University of Oxford
1902        University of Oxford
2730      University of Aberdeen
                  ...           
102          University of Leeds
1490       University of Glasgow
2464    University of Manchester
1132        Lancaster University
1316    University of Nottingham
Name: uni_name, Length: 1917, dtype: object

In [46]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(sparse_threshold=1,
                                   transformers=[('corpus',
                                                  TfidfVectorizer(max_df=0.3,
                                                                  max_features=10000,
                                                                  stop_words=frozenset({'a',
                                                                                        'about',
                                                                                        'above',
                                                                                        'across',
                                                                                        'after',
                                                                                        'afterwards',
                                                                                        'again',
       

In [47]:
# 5-fold cross-validation on the training split; returns one score per fold
# (append .mean() for a single summary number).
cross_val_score(clf, X_train, y_train, cv=5)

array([0.37239583, 0.34635417, 0.40469974, 0.39947781, 0.36553525])

In [48]:
# Score of the fitted pipeline on the held-out test split.
clf.score(X_test, y_test)

0.4270833333333333

In [56]:
# Confusion matrix on the held-out split (rows = true university,
# columns = predicted university).
predictions = clf.predict(X_test)

# Pin the label order to the classifier's own class list. Without an explicit
# `labels` argument confusion_matrix derives the labels from y_test and the
# predictions, which can disagree with names taken from y_train when a class
# is absent from the test data - the table would then be mislabelled or fail
# on a shape mismatch.
class_labels = list(clf.classes_)
confusion = confusion_matrix(y_test, predictions, labels=class_labels)
pd.DataFrame(confusion, columns=class_labels, index=class_labels)

Unnamed: 0,Durham University,Imperial College London,"King's College London, University of London",Lancaster University,Queen Mary University of London,UCL (University College London),University of Aberdeen,University of Bristol,University of Cambridge,University of Edinburgh,University of Glasgow,University of Leeds,University of Manchester,University of Nottingham,University of Oxford,University of Sheffield,University of Southampton,University of Warwick,University of York
Durham University,14,1,2,0,0,1,1,0,0,1,0,1,0,2,2,0,3,1,1
Imperial College London,2,15,0,0,0,0,2,0,0,0,0,0,0,1,3,0,2,0,0
"King's College London, University of London",2,0,13,1,0,0,0,0,2,0,1,1,0,0,1,1,0,1,0
Lancaster University,2,1,1,12,1,1,0,0,1,3,1,0,1,0,2,1,1,3,2
Queen Mary University of London,0,0,0,1,9,1,0,0,2,0,0,0,1,3,1,1,2,1,2
UCL (University College London),2,1,2,1,0,5,1,0,1,1,0,1,2,1,0,0,2,1,0
University of Aberdeen,0,1,1,0,0,0,11,0,0,3,1,0,1,0,0,2,1,1,0
University of Bristol,0,1,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,1
University of Cambridge,0,0,1,1,0,0,0,0,18,0,0,0,0,1,0,1,0,0,0
University of Edinburgh,1,0,0,1,0,0,7,1,0,11,2,0,0,1,0,1,1,0,0


In [60]:
# Per-university precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, clf.predict(X_test))
print(report_text)

                                             precision    recall  f1-score   support

                          Durham University       0.37      0.47      0.41        30
                    Imperial College London       0.50      0.60      0.55        25
King's College London, University of London       0.54      0.57      0.55        23
                       Lancaster University       0.44      0.36      0.40        33
            Queen Mary University of London       0.41      0.38      0.39        24
            UCL (University College London)       0.50      0.24      0.32        21
                     University of Aberdeen       0.37      0.50      0.42        22
                      University of Bristol       0.40      0.33      0.36         6
                    University of Cambridge       0.58      0.82      0.68        22
                    University of Edinburgh       0.44      0.42      0.43        26
                      University of Glasgow       0.59      0.39

## Observations
- This model is okay at predicting the university. Its accuracy is about 30-40 percentage points above the baseline on average.
- Both precision and recall scores are low for University of Warwick. Low precision means the model frequently labels other universities as University of Warwick, and low recall means it also misses many true University of Warwick entries; this will be something to look out for.
- Recall scores are low for both UCL and University of Leeds, meaning that the model rarely identifies entries from these universities correctly.
- University of York and University of Southampton were often mistaken for University of Warwick.
- University of Edinburgh and University of Aberdeen were often mixed up (both are in Scotland). Interestingly, University of Glasgow was not mixed up with these two universities. 

In [62]:
# Persist the fitted pipeline. The context manager closes the file handle
# (and thereby flushes the data to disk) even if pickling fails; the bare
# open() used before never closed it. Assumes the models/ directory exists.
with open('models/uni_wizz_v0.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [64]:
# Reload the pickled pipeline to check that it round-trips. The context
# manager closes the file handle, which the bare open() used before never did.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open('models/uni_wizz_v0.pkl', 'rb') as model_file:
    tmm = pickle.load(model_file)

In [67]:
# Try out the reloaded predictor on a single hand-written example.
raw_features = {
    "uni_major": "psychology",
    # The value was missing in the source (a SyntaxError); 1 is the value
    # shown in the recorded output of this cell.
    "job_prospects": 1,
    "course_lectures": 3,
    "facilities": 2,
    "local_life": 1,
    "societies_and_sports": 5,
}
# A one-element list of dicts gives a single-row frame whose columns are the
# dict keys - the column names the pipeline was fitted on.
final_features = pd.DataFrame([raw_features])
print(final_features)

print(str(tmm.predict(final_features)[0]))

    uni_major  job_prospects  course_lectures  facilities  local_life  \
0  psychology              1                3           2           1   

   societies_and_sports  
0                     5  
Queen Mary University of London


In [49]:
# Predicted universities for the training split and for the held-out split.
y_train_pred, y_pred = (clf.predict(features) for features in (X_train, X_test))

In [51]:
# Inspect the predictions made for the training rows.
y_train_pred

array(['University of Bristol', 'Durham University',
       'University of Oxford', ..., 'University of Manchester',
       'Lancaster University', 'University of Nottingham'], dtype=object)

In [80]:
# Third training row (positional index 2), shown as a Series.
X_train.iloc[2]

Uni_Major               Physics (4-year MPhys) MPhys
job_prospects                                      0
course_lectures                                    0
Facilities                                         4
local_life                                         4
societies_and_sports                               0
Name: 1892, dtype: object

In [91]:
# Inspect the full target column.
y

0                               University of Leeds
1                               University of Leeds
2                               University of Leeds
3                               University of Leeds
4                               University of Leeds
                           ...                     
3227    King's College London, University of London
3230    King's College London, University of London
3231    King's College London, University of London
3233    King's College London, University of London
3236    King's College London, University of London
Name: uni_name, Length: 2397, dtype: object

In [86]:
# Inspect the held-out feature rows.
X_test

Unnamed: 0,Uni_Major,job_prospects,course_lectures,Facilities,local_life,societies_and_sports
2120,Human Geography and Environment BA (Hons),0.0,0.0,0.0,0.0,0.0
2453,Economics and Politics BAECOM,3.0,4.0,4.0,4.0,3.0
2850,Chemical Engineering BEng (Hons),0.0,4.0,4.0,3.0,0.0
2122,Politics with International Relations BA (Hons),4.0,3.0,4.0,0.0,4.0
1871,Marine Biology with Oceanography BSc (Hons),4.0,0.0,0.0,4.0,4.0
...,...,...,...,...,...,...
2626,Biological Sciences BSc (Hons),4.0,4.0,2.0,4.0,4.0
183,MASt in Astrophysics,4.0,4.0,4.0,4.0,3.0
2588,English Literature BA (Hons),4.0,4.0,4.0,0.0,0.0
2787,Law LLB (Hons),4.0,4.0,0.0,3.0,3.0


In [99]:
# Third held-out row (positional index 2), shown as a Series.
X_test.iloc[2]

Uni_Major               Chemical Engineering BEng (Hons)
job_prospects                                          0
course_lectures                                        4
Facilities                                             4
local_life                                             3
societies_and_sports                                   0
Name: 2850, dtype: object

In [101]:
# Turn the third held-out row into a one-column frame (features as the index).
dataframe = X_test.iloc[2].to_frame()

In [132]:
# Flip to a single-row frame with one column per feature, the layout that
# clf.predict expects.
dataframe_reshaped = dataframe.T

In [133]:
# Predict the university for the single reshaped row.
clf.predict(dataframe_reshaped)

array(['University of Manchester'], dtype=object)

In [83]:
# Inspect the held-out feature rows (same display as the earlier X_test cell).
X_test

Unnamed: 0,Uni_Major,job_prospects,course_lectures,Facilities,local_life,societies_and_sports
2120,Human Geography and Environment BA (Hons),0.0,0.0,0.0,0.0,0.0
2453,Economics and Politics BAECOM,3.0,4.0,4.0,4.0,3.0
2850,Chemical Engineering BEng (Hons),0.0,4.0,4.0,3.0,0.0
2122,Politics with International Relations BA (Hons),4.0,3.0,4.0,0.0,4.0
1871,Marine Biology with Oceanography BSc (Hons),4.0,0.0,0.0,4.0,4.0
...,...,...,...,...,...,...
2626,Biological Sciences BSc (Hons),4.0,4.0,2.0,4.0,4.0
183,MASt in Astrophysics,4.0,4.0,4.0,4.0,3.0
2588,English Literature BA (Hons),4.0,4.0,4.0,0.0,0.0
2787,Law LLB (Hons),4.0,4.0,0.0,3.0,3.0
