# University Rankings

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data Setup

In [2]:
# Read the rankings CSV from the notebook's working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="cwurData.csv")
df.head(n=5)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [10]:
# Rows whose `country` value is exactly 'Turkey'.
is_turkey = df['country'].eq('Turkey')
df.loc[is_turkey]

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
595,396,Middle East Technical University,Turkey,1,355,79,210,529,904,493,724.0,737,45.57,2014
801,602,Istanbul University,Turkey,2,255,337,210,681,580,609,554.0,737,44.81,2014
854,655,Hacettepe University,Turkey,3,355,478,210,636,636,406,554.0,737,44.71,2014
972,773,Istanbul Technical University,Turkey,4,355,388,210,641,858,363,769.0,637,44.56,2014
987,788,Ankara University,Turkey,5,355,461,210,751,723,493,739.0,737,44.54,2014
1007,808,Ege University,Turkey,6,355,478,210,734,822,406,724.0,737,44.52,2014
1055,856,Bilkent University,Turkey,7,355,478,210,788,782,609,783.0,737,44.44,2014
1099,900,Gazi University,Turkey,8,355,478,210,742,952,609,849.0,737,44.39,2014
1102,903,Boğaziçi University,Turkey,9,355,478,210,817,820,493,849.0,737,44.39,2014
1133,934,Dokuz Eylül University,Turkey,10,355,478,210,948,914,800,889.0,737,44.35,2014


In [11]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


In [12]:
# Pearson correlation of every numeric column with the target `score`.
# `institution` and `country` are object columns; they are excluded explicitly
# because DataFrame.corr() in pandas >= 2.0 raises on non-numeric data instead
# of silently dropping it.
df.select_dtypes(include='number').corr()['score']

world_rank             -0.549098
national_rank          -0.199756
quality_of_education   -0.600541
alumni_employment      -0.510374
quality_of_faculty     -0.693540
publications           -0.522111
influence              -0.522837
citations              -0.522438
broad_impact           -0.531590
patents                -0.474810
score                   1.000000
year                   -0.239136
Name: score, dtype: float64

In [13]:
# Rank the numeric columns by the strength (absolute value) of their Pearson
# correlation with `score`, strongest first. Object columns are excluded
# explicitly because DataFrame.corr() in pandas >= 2.0 raises on them.
(
    df.select_dtypes(include='number')
    .corr()['score']
    .abs()
    .sort_values(ascending=False)
)

score                   1.000000
quality_of_faculty      0.693540
quality_of_education    0.600541
world_rank              0.549098
broad_impact            0.531590
influence               0.522837
citations               0.522438
publications            0.522111
alumni_employment       0.510374
patents                 0.474810
year                    0.239136
national_rank           0.199756
Name: score, dtype: float64

In [14]:
# (rows, columns) of the frame before any rows are dropped for modelling.
df.shape

(2200, 14)

# Modelling

In [31]:
# Discard every row that has at least one missing value. Per the info()
# output above, only `broad_impact` has nulls (2000 of 2200 non-null), so this
# removes 200 rows.
df = df.dropna(axis=0, how='any')

In [32]:
# Feature matrix: everything except the target (`score`), `year`, and the
# `institution` name.
# NOTE(review): `world_rank` / `national_rank` look like they are derived from
# the score ordering - possible target leakage; confirm before trusting the
# model metrics.
non_feature_cols = ['score', 'year', 'institution']
X = df.drop(columns=non_feature_cols)

In [33]:
# Regression target: the overall `score` column.
y = df.loc[:, 'score']

In [34]:
# One-hot encode the remaining object columns of X, dropping the first level
# of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)

In [35]:
from sklearn.linear_model import ElasticNet
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression #sklearn = scikit-learn ("scikit" = SciPy Toolkit)
from sklearn.model_selection import train_test_split

In [36]:
def algo_test(x, y, test_size=0.2, random_state=13):
    """Fit several regressors on one train/test split and compare them.

    Parameters
    ----------
    x : feature matrix accepted by ``train_test_split`` and the estimators'
        ``fit`` / ``predict``.
    y : target values aligned with ``x``.
    test_size : fraction of rows held out for evaluation (default 0.2).
    random_state : seed used for the split and for the estimators that take a
        ``random_state`` (default 13), so repeated calls give the same table.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm (indexed by its display name) with columns
        ``R_Squared``, ``RMSE`` and ``MAE`` computed on the held-out split,
        sorted by ``R_Squared`` in descending order.
    """
    models = {
        'Linear': LinearRegression(),
        'ElasticNet': ElasticNet(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
        # The tree/boosting estimators are stochastic; seed them so the
        # comparison is reproducible.
        'Extra Tree': ExtraTreeRegressor(random_state=random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
        'XGBRegressor': XGBRegressor(random_state=random_state),
    }

    # Split the data passed in by the caller (the `x` parameter), not a
    # notebook-level global.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    records = []
    for model in models.values():
        model.fit(X_train, y_train)
        # Predict once and reuse the predictions for every metric.
        y_pred = model.predict(X_test)
        records.append({
            'R_Squared': r2_score(y_test, y_pred),
            'RMSE': mean_squared_error(y_test, y_pred) ** .5,
            'MAE': mean_absolute_error(y_test, y_pred),
        })

    result = pd.DataFrame(records,
                          columns=['R_Squared', 'RMSE', 'MAE'],
                          index=list(models))

    return result.sort_values('R_Squared', ascending=False)

In [37]:
# Compare all candidate regressors on the encoded features and the score target.
algo_test(X, y)



Unnamed: 0,R_Squared,RMSE,MAE
Gradient Boosting,0.996761,0.407257,0.150194
Extra Tree,0.989791,0.722979,0.26455
XGBRegressor,0.988068,0.781625,0.233002
Ridge,0.616443,4.431577,2.187882
Linear,0.61566,4.436098,2.189643
ElasticNet,0.594327,4.557552,2.251085
Lasso,0.594228,4.558108,2.246459
