In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

#### Challenge: Model Comparison

You now know two kinds of regression and two kinds of classifier. So let's use that to compare models!

Comparing models is something data scientists do all the time. There's very rarely just one model that would be possible to run for a given situation, so learning to choose the best one is very important.

Here let's work on regression. Find a data set and build a KNN Regression and an OLS regression. Compare the two. How similar are they? Do they miss in different ways?

Create a Jupyter notebook with your models. At the end in a markdown cell write a few paragraphs to describe the models' behaviors and why you favor one model or the other. Try to determine whether there is a situation where you would change your mind, or whether one is unambiguously better than the other. Lastly, try to note what it is about the data that causes the better model to outperform the weaker model. Submit a link to your notebook below.

#### Description of ‘A-level-geography’

The data contain the result of examination on A-level geometry for 33,276 students from over 2,000 institutions in England in 1997. There are 15 fields in the data set of ASCII format, and each field is separated by a blank space. The detailed description of the fields is as followings


| Variable  | Coding | Description |
| --------- | ------ | ----------- |
| SCORE   |   0, 2, 4, 6, 8, 10   | 0=fail, 2=grade E, 4=grade D, 6=grade C, 8=grade B, 10=grade A | 
| BOARD  | 1 – 7  | 1=Associate and WJB, 2=Cambridge, 3=London, 5=Oxforld, 6=Joint Matriculation, 7=Oxford-Cambridge |
| GCSE-G-SCORE | 0,2,3,4,5,6,7,8 | 0=fail, 2=grade F,  3=grade E, 4=grade D, 5=grade C, 6=grade B, 7=grade A, 8=grade A* | 
| GENDER | 0 or 1 | 0=Male, 1=Female | 			
| GTOT | 19 ~ 95 continuous | 	Total point score of all GCSE subjects
| GNUM | 4 ~13 continuous | 	Total number of GCSE taken 
| GCSE-MA-MAX | 0 – 8  | Maximum point score for GCSE math: 0=fail, 2=grade F,  3=grade E, 4=grade D, 5=grade C, 6=grade B, 7=grade A, 8=grade A* | 
| GCSE-math-n | 	1,2,3,4 | 	Total number of GCSE math subjects taken | 
| AGE | 	continuous | 	Age of student in month, centred at 222 months ( 18.5 years) | 
| INST-GA-MN | 	continuous | 	Institution average of GCSE score, centred at its mean | 
| INST-GA-SD | 	continuous | 	Institution standard deviation of GCSE score | 
| INSTTYPE	Category | 1 ~ 11 |1 = LEA Maintained Comprehensive, 2 = Maintained Selective, 3 = Maintained Modern, 4 = Grammar Comprehensive, 5 = Grammar Selective, 6 = Grammar Modern, 7 = Independent selective, 8 = Independent non-selective, 9 = Sixth Form College, 10 = Further Education College, 11 = Others | 
| LEA | 	1 ~ 131 | 	Local Education Authority identification | 
| INSTITUTE | 	1 ~ 98 | 	Institution identification within LEA | 
| STUDENT | 	25 ~ 196053 | 	Student identification | 

In [2]:
df = pd.read_csv('data/geography.txt', sep=' ', header=None) 
df.columns = ['a_scre','boards', 'g_ge_s', 'gender', 'g_tl_s', 'g_tl_n','g_m_mx', 'g_m_tl','age_mh',
              'i_g_mn', 'i_g_sd','i_type', 'lea_id', 'ise_id', 'studnt' ]
df.tail()

Unnamed: 0,a_scre,boards,g_ge_s,gender,g_tl_s,g_tl_n,g_m_mx,g_m_tl,age_mh,i_g_mn,i_g_sd,i_type,lea_id,ise_id,studnt
33271,8.0,3,7,1,71,11.0,7,1,-3.0,-0.06,0.65,9.0,131.0,33.0,196035.0
33272,6.0,3,6,0,52,9.0,6,1,1.0,-0.06,0.65,9.0,131.0,33.0,196037.0
33273,4.0,3,5,1,46,9.0,5,1,-3.0,-0.06,0.65,9.0,131.0,33.0,196039.0
33274,8.0,3,5,1,52,9.0,5,1,5.0,-0.06,0.65,9.0,131.0,33.0,196047.0
33275,6.0,3,6,1,48,9.0,5,1,-3.0,-0.06,0.65,9.0,131.0,33.0,196053.0


In [None]:
#  {'col1':{'lbl':'col1', 'fnn':'ABC', 'kwg': {}}, 'col2':{}, }

In [3]:
drops1 = ['i_g_sd', 'studnt']
drops2 = ['lea_id', 'g_tl_s', 'g_tl_n', 'i_ga_mn', 'i_ga_sd']
df = df.drop(drops1, axis=1)

In [4]:
a = ['passed']                                                             # new ... binary target 1 ==> passing score
b = ['g_avg_']                                                             # new ... students average score on gsce math exams
c = ['ise_id']                                                             # create unique institue id
d = ['age_mh']                                                             # tranform months to continous
e = ['board', 'ise_id', 'i_type', 'g_m_tl']                                # rank feature based upon target 
f = ['board', 'ise_id', 'i_type', 'g_ge_s', 'g_m_mx', 'age_mh', 'g_m_tl']  # linear scale features -2 to 2
g = ['g_avg_']                                                             # standardize feature with z score

bg  =  list(set(b) & set(g))
cef =  list(set(c) & set(e) & set(f))
df  =  list(set(d) & set(f)) 
ef  =  list(set(e) & set(f) - set(cef)) 
f   =  list(set(f) - set(df) - set(ef) - set(cef))

groups = ['bg', 'cef', 'df', 'ef', 'f', 'a']


In [35]:
# functions to clean or tranform columns
def A(df, fte, tgt):
    srs = np.where(df[tgt] > 0, 1, 0)
    return df
def B(df, dict):
    df[dict['fte']] = df[dict['cl1']].div(df[dict['cl2']])
    return df    
def C():  # standardize 
    z_score = lambda x: (x-x.mean())/x.std()
    return S.transform(z_score )    

'''
def A(S): # tranform to numeric range step 1 for unique values
    dct = {S.unique()[i]:i for i in range(len(S.unique()))}
    return S.apply(lambda x: dct[x])
def B(S):  # standardize 
    z_score = lambda x: (x-x.mean())/x.std()
    return S.transform(z_score )
def C(S):  # scale to approx range of -2 to 2
    rng = x.max()-x.min()
    scale = lambda x: (x- rng/ 2) * 4 / rng
    return S.transform(scale)
def Z(S):  # dummy function
    return S
'''
switch = {char:eval(char) for char in 'AB'}

def switch_clean(df, *dict):
    for key in dict.keys()
        for f in dict[key]['fnn']:
            
    
        df[dict['fte']] = switch[f](df,dict)               
    return df'''

"def switch_clean(df, *dict):\n    for key in dict.keys()\n        for f in dict[key]['fnn']:\n            \n    \n        df[dict['fte']] = switch[f](df,dict)               \n    return df"

In [36]:
switch['A'](df, lbl, fnn, kwg )

TypeError: A() got an unexpected keyword argument 'fnn'

In [31]:
for key in clean.keys():
    for f in clean[key]['fnn']:
        print(f)

B
G
C
E
F
D
F
E
F
E
F
E
F
F
F
A


In [23]:
dx = switch_clean(df, clean['passed'])

TypeError: list indices must be integers or slices, not str

In [18]:
clean['passed']['fte']

'passed'

In [19]:
df[passed]

NameError: name 'passed' is not defined

In [7]:
# create dict "clean" with all parameters for switch_clean()
dict1 = {val:str.upper(sub) for sub in groups for val in eval(sub)}
clean = {key:{'fte':key,'fnn':dict1[key], 'tgt':'a_scre'} for key in dict1.keys()}
clean['passed']['cl1'] = 'a_scre'
clean['g_avg_']['cl1'] = 'g_tl_s'
clean['g_avg_']['cl2'] = 'g_tl_n'

In [None]:
# create a binary target for passing score
df['passd'] = np.where(df.score > 0, 1, 0)
# column tranformations to model features
 
df.age = df.age + 222
df['inst_id'] = df.lea * 100 + df.institute
# to rank feature values to be scaled low to high by 'score'
rnk_brd = df.groupby('board').score.mean().sort_values(ascending=True).index
rnk_ite = df.groupby('inst_type').score.mean().sort_values(ascending=True).index
rnk_iid = df.groupby('inst_id').score.mean().sort_values(ascending=True).index
df.board = df.board.apply(lambda x : {rank_brd[i]:i for i in range(len(rank_brd))}[x])
# drop columns net needed

df = df.drop(drops,axis=1)
df.tail()

In [None]:
plt.hist(df.g_m_mx)
plt.show()

In [None]:


rnk_brd = df.groupby('board').score.mean().sort_values(ascending=True).index
rnk_ite = df.groupby('inst_type').score.mean().sort_values(ascending=True).index
rnk_iid = df.groupby('inst_id').score.mean().sort_values(ascending=True).index
df.board = df.board.apply(lambda x : {rnk_brd[i]:i for i in range(len(rnk_brd))}[x])

In [None]:
def rank_by_target(df, tgt, fte):
    rnk = df.groupby(fte)[tgt].mean().sort_values(ascending=True).index
    dct = {rnk[i]:i for i in range(len(rnk))}
    df[fte] = df[fte].apply()
    

In [None]:
X = pd.DataFrame(dct2)
Y_score = X.score
Y_passd = X.passd
X = X.drop(['score', 'passd'], axis=1)

In [None]:
Y_passd.head()

In [None]:
X.head()

In [None]:
knn_score = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_passd = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance')
knn_score.fit(X, Y_score)
knn_passd.fit(X, Y_passd)

In [None]:
acc_score = cross_val_score(knn_score, X, Y_score, cv=5)
acc_passd = cross_val_score(knn_passd, X, Y_passd, cv=5)
print("Predict Score Accuracy: %0.2f (+/- %0.4f)" % (acc_score.mean(), acc_score.std() ** 2))
print("Predict passd Accuracy: %0.2f (+/- %0.4f)" % (acc_passd.mean(),  acc_passd.std()  ** 2))


In [None]:
acc_passd

In [None]:
param = [(x,y ) for x in range(1,11) for y in ['distance', 'uniform']]

In [None]:
results = []
for a,b in param:  
    knn1 = neighbors.KNeighborsRegressor(n_neighbors=a, weights=b)
    X = music[['loudness', 'duration']]
    Y = music.bpm
    knn.fit(X, Y)
    score = cross_val_score(knn1, X, Y, cv=3)
    if b == 'distance': weighted = True
    else:               weighted = False
    results.append({'n':a,'weighted':weighted,'accuracy':np.round(score.mean(), decimals=2),
                    'std':np.round(score.std(), decimals=2)})
 

In [None]:
dfr = pd.concat([pd.Series(dct) for dct in results], axis=1, sort=False).T   
  

In [None]:
table = dfr.pivot(index='n', columns='weighted', values=['accuracy', 'std'])
table

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

# Save predicted values.
Y_pred = regr.predict(X)
print('R-squared regression:', regr.score(X, y))

# Fit a linear model using Partial Least Squares Regression.
# Reduce feature space to 3 dimensions.
pls1 = PLSRegression(n_components=3)

# Reduce X to R(X) and regress on y.
pls1.fit(X, y)

# Save predicted values.
Y_PLS_pred = pls1.predict(X)