Skip to content
Binary file modified __pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q01_load_data/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q01_load_data/__pycache__/build.cpython-36.pyc
Binary file not shown.
14 changes: 13 additions & 1 deletion q01_load_data/build.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
# %load q01_load_data/build.py
import pandas as pd

# Write your code below


path = 'data/student-mat.csv'

def load_data(path, sep=';'):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Location of the file to read.
    sep : str, optional
        Field delimiter. Defaults to ``';'``, the value that used to be
        hard-coded here, so existing ``load_data(path)`` callers are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # read_csv with an explicit separator parses exactly like
    # read_table(path, sep=...) but avoids the read_table deprecation
    # warning emitted by pandas 0.24.
    return pd.read_csv(path, sep=sep)

#load_data(path)



Binary file modified q01_load_data/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q01_load_data/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q02_data_split/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q02_data_split/__pycache__/build.cpython-36.pyc
Binary file not shown.
12 changes: 11 additions & 1 deletion q02_data_split/build.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
# %load q02_data_split/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts
import pandas as pd
df = load_data('data/student-mat.csv')

# Write your code below
def split_dataset(df):
    """Split ``df`` into train/test features and targets.

    The ``'G3'`` column is used as the target; every other column is a
    feature. 20% of the rows are held out for testing.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    target = df['G3']
    features = df.drop('G3', axis=1)
    # train_test_split returns a list; callers receive a 4-tuple.
    return tuple(tts(features, target, test_size=0.2))

split_dataset(df)



Binary file modified q02_data_split/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q02_data_split/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q03_data_encoding/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q03_data_encoding/__pycache__/build.cpython-36.pyc
Binary file not shown.
14 changes: 14 additions & 0 deletions q03_data_encoding/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q03_data_encoding/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from sklearn.preprocessing import LabelEncoder
Expand All @@ -9,6 +10,19 @@

# Write your code below

def label_encode(X, X_test):
    """Label-encode every column of the train and test feature frames.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    X_test : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple
        ``(X_transform, X_test_transform)``: the encoded training frame
        and the encoded test frame.

    Notes
    -----
    ``fit_transform`` is applied column by column, and the test frame is
    encoded independently of the training frame, so the integer assigned
    to a given label is not guaranteed to match between the two outputs.
    """
    # Encode the frames that were passed in. The previous body ignored its
    # parameters and read the module-level x_train / x_test instead.
    X_transform = X.apply(LabelEncoder().fit_transform)
    X_test_transform = X_test.apply(LabelEncoder().fit_transform)
    return X_transform, X_test_transform

label_encode(x_train,x_test)






Binary file modified q03_data_encoding/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q03_data_encoding/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q03_ohe_encoder/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q03_ohe_encoder/__pycache__/build.cpython-36.pyc
Binary file not shown.
21 changes: 18 additions & 3 deletions q03_ohe_encoder/build.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
# %load q03_ohe_encoder/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)
X_train, X_test, y_train, y_test = split_dataset(df)

category_index = [x for x in range(len(df.columns)) if df[df.columns[x]].dtype == 'object']


# Write your code below

def ohe_encode(X_train, X_test, category_index=category_index):
    """One-hot encode the categorical columns of the train and test frames.

    Both frames are first passed through ``label_encode``; the columns
    whose positions are listed in ``category_index`` are then expanded by
    a ``OneHotEncoder``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    X_test : pandas.DataFrame
        Test features.
    category_index : list of int, optional
        Positions of the categorical columns. Defaults to the module-level
        ``category_index``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_train, encoded_test)``.
    """
    X_train, X_test = label_encode(X_train, X_test)

    ohe = OneHotEncoder(categorical_features=category_index, sparse=False)
    # Fit on the training data only. The previous body called fit() a second
    # time on X_test, which discarded the training fit and made the encoding
    # depend on the test set.
    ohe.fit(X_train)

    encoded_train = ohe.transform(X_train)
    encoded_test = ohe.transform(X_test)

    return pd.DataFrame(encoded_train), pd.DataFrame(encoded_test)


# ohe_encode(X_train,X_test,category_index)



Binary file modified q03_ohe_encoder/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q03_ohe_encoder/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q04_data_visualisation/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q04_data_visualisation/__pycache__/build.cpython-36.pyc
Binary file not shown.
10 changes: 9 additions & 1 deletion q04_data_visualisation/build.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
# %load q04_data_visualisation/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
Expand All @@ -10,7 +10,15 @@
x_train,x_test = label_encode(x_train,x_test)

# Write your code below

def visualise_data(data, figname):
    """Build a scatter-plot matrix for ``data`` and return it.

    ``figname`` is accepted for interface compatibility but is not used
    by this implementation; nothing is written to disk here.
    """
    axes = scatter_matrix(data)
    return axes

# visualise_data(data,figname)







Binary file modified q04_data_visualisation/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q04_data_visualisation/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q05_linear_regression_model/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q05_linear_regression_model/__pycache__/build.cpython-36.pyc
Binary file not shown.
14 changes: 11 additions & 3 deletions q05_linear_regression_model/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q05_linear_regression_model/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset

Expand All @@ -7,9 +8,16 @@
df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train, x_test = label_encode(x_train,x_test)


# Write your code below


def linear_regression(X, y):
    """Fit a ``LinearRegression`` on ``X`` / ``y`` and return the result of ``fit``."""
    return LinearRegression().fit(X, y)

#linear_regression(X,y)



Binary file not shown.
Binary file modified q05_linear_regression_model/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q06_cross_validation/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q06_cross_validation/__pycache__/build.cpython-36.pyc
Binary file not shown.
13 changes: 11 additions & 2 deletions q06_cross_validation/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q06_cross_validation/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset

Expand All @@ -12,10 +13,18 @@
df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train,x_test = label_encode(x_train,x_test)

model =linear_regression(x_train,y_train)

# Write your code below
def cross_validation_regressor(model, X, y):
    """Return the mean of the cross-validated R^2 scores of ``model``.

    ``cross_val_score`` is called with ``scoring='r2'`` and otherwise
    default settings; the per-fold scores are averaged.
    """
    fold_scores = cross_val_score(model, X, y, scoring='r2')
    return fold_scores.mean()

# cross_validation_regressor(model, x_train,y_train)




Binary file modified q06_cross_validation/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q06_cross_validation/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q07_regression_pred/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q07_regression_pred/__pycache__/build.cpython-36.pyc
Binary file not shown.
14 changes: 13 additions & 1 deletion q07_regression_pred/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q07_regression_pred/build.py

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Expand All @@ -18,5 +19,16 @@

val = cross_validation_regressor(model,x_train,y_train)


# Write your code below

def regression_predictor(model, X, y):
    """Predict on ``X`` with ``model`` and score the predictions against ``y``.

    Returns
    -------
    tuple
        ``(y_pred, mse, mae, r2)``: the predictions followed by the mean
        squared error, mean absolute error and R^2 score.
    """
    y_pred = model.predict(X)
    # Evaluate the three metrics in the order they are returned.
    metrics = (mean_squared_error, mean_absolute_error, r2_score)
    mse, mae, r2 = (metric(y, y_pred) for metric in metrics)
    return y_pred, mse, mae, r2

# regression_predictor(model,x_test,y_test)



Binary file modified q07_regression_pred/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q07_regression_pred/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q08_linear_model/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q08_linear_model/__pycache__/build.cpython-36.pyc
Binary file not shown.
18 changes: 17 additions & 1 deletion q08_linear_model/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q08_linear_model/build.py
import pandas as pd
import numpy as np
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
Expand All @@ -7,7 +8,6 @@
from greyatomlib.multivariate_regression_project.q06_cross_validation.build import cross_validation_regressor
from greyatomlib.multivariate_regression_project.q07_regression_pred.build import regression_predictor


df = load_data('data/student-mat.csv')
x_train, x_test, y_train, y_test = split_dataset(df)
x_train,x_test = label_encode(x_train,x_test)
Expand All @@ -16,5 +16,21 @@
y_pred, mse, mae, r2 = regression_predictor(model, x_test, y_test)

# Write your code below

def linear_model(x_train, x_test, y_train, y_test):
    """Fit a linear regression and summarise its performance.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, used for fitting and for
        cross-validation.
    x_test, y_test
        Held-out features and targets, used for the test metrics.

    Returns
    -------
    tuple
        ``(G, y_pred, stats)``: ``G`` is the fitted model, ``y_pred`` its
        predictions on ``x_test`` and ``stats`` a one-row DataFrame with
        the columns ``cross_val``, ``mae``, ``rmse`` and ``r2``.
    """
    G = linear_regression(x_train, y_train)
    y_pred, mse, mae, r2 = regression_predictor(G, x_test, y_test)

    # Cross-validate the model fitted here, not the module-level `model`.
    val = cross_validation_regressor(G, x_train, y_train)

    # regression_predictor returns the MSE; take its root so the 'rmse'
    # column holds what its label says. The previous body put the
    # module-level `mse` into this column instead of this call's value.
    rmse = np.sqrt(mse)
    stats = pd.DataFrame(
        [(val, mae, rmse, r2)],
        columns=['cross_val', 'mae', 'rmse', 'r2'],
    )

    return G, y_pred, stats

# linear_model(x_train, x_test, y_train, y_test)





Binary file modified q08_linear_model/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q08_linear_model/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q09_advanced_model_q01_lasso/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q09_advanced_model_q01_lasso/__pycache__/build.cpython-36.pyc
Binary file not shown.
17 changes: 16 additions & 1 deletion q09_advanced_model_q01_lasso/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q09_advanced_model_q01_lasso/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data

from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
Expand All @@ -15,9 +16,23 @@
df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train,x_test = label_encode(x_train,x_test)

# Write your solution here

def lasso(x_train, x_test, y_train, y_test, alpha=0.1):
    """Fit a Lasso regression and summarise its performance.

    Returns
    -------
    tuple
        ``(G, y_pred, stats)``: the fitted model, its predictions on
        ``x_test`` and a one-row DataFrame with the columns
        ``cross_val``, ``mae``, ``r2`` and ``rmse``.
    """
    G = Lasso(alpha=alpha)
    G.fit(x_train, y_train)

    cv_mean = cross_validation_regressor(G, x_train, y_train)
    y_pred, mse, mae, r2 = regression_predictor(G, x_test, y_test)

    labels = ['cross_val', 'mae', 'r2', 'rmse']
    row = (cv_mean, mae, r2, np.sqrt(mse))  # rmse is the root of the MSE
    stats = pd.DataFrame([row], columns=labels)

    return G, y_pred, stats

# lasso(x_train,x_test,y_train,y_test)



Binary file not shown.
Binary file modified q09_advanced_model_q01_lasso/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q09_advanced_model_q02_ridge/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q09_advanced_model_q02_ridge/__pycache__/build.cpython-36.pyc
Binary file not shown.
22 changes: 19 additions & 3 deletions q09_advanced_model_q02_ridge/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q09_advanced_model_q02_ridge/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data

from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
Expand All @@ -13,13 +14,28 @@
np.random.seed(9)

df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train, x_test, y_train, y_test = split_dataset(df)
x_train,x_test = label_encode(x_train,x_test)

# Write your code below


def ridge(x_train, x_test, y_train, y_test, alpha=0.1):
    """Fit a Ridge regression (``normalize=True``) and summarise its performance.

    Returns
    -------
    tuple
        ``(ridge_regressor, y_pred, scores)``: the fitted model, its
        predictions on ``x_test`` and a DataFrame with the columns
        ``cross_val``, ``mae``, ``r2`` and ``mse``.

    Notes
    -----
    The ``mse`` column holds the square root of the mean squared error
    (``mse ** 0.5``), not the raw MSE.
    """
    ridge_regressor = Ridge(alpha=alpha, normalize=True)
    ridge_regressor.fit(x_train, y_train)

    y_pred, mse, mae, r2 = regression_predictor(ridge_regressor, x_test, y_test)
    val = cross_validation_regressor(ridge_regressor, x_train, y_train)

    # Add the columns one at a time, in the same order as before.
    scores = pd.DataFrame()
    for label, value in (
        ('cross_val', val),
        ('mae', mae),
        ('r2', r2),
        ('mse', mse ** 0.5),
    ):
        scores[label] = pd.Series(value)

    return ridge_regressor, y_pred, scores

# ridge(x_train,x_test,y_train,y_test,0.1)


Binary file not shown.
Binary file modified q09_advanced_model_q02_ridge/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file modified q10_data_missing_values/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q10_data_missing_values/__pycache__/build.cpython-36.pyc
Binary file not shown.
14 changes: 14 additions & 0 deletions q10_data_missing_values/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q10_data_missing_values/build.py
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
Expand All @@ -10,4 +11,17 @@
x_train,x_test = label_encode(x_train,x_test)

# Write your code below

def describe_df(x_train):
    """Summarise a DataFrame and count the values in each column.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(describe, value_counts)``: the output of ``DataFrame.describe``
        and a frame in which each column holds that column's value counts,
        indexed by the union of the values seen. Values that do not occur
        in a column appear as NaN.
    """
    describe = x_train.describe()
    # Series.value_counts gives the same result as the top-level
    # pd.value_counts, which newer pandas releases deprecate.
    value_counts = x_train.apply(pd.Series.value_counts)

    return describe, value_counts

# describe_df(x_train)




Binary file modified q10_data_missing_values/tests/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q10_data_missing_values/tests/__pycache__/test.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
20 changes: 19 additions & 1 deletion q11_feature_selection_q01_plot_corr/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q11_feature_selection_q01_plot_corr/build.py

import matplotlib.pyplot as plt
from matplotlib.pyplot import yticks, xticks, subplots, set_cmap
Expand All @@ -21,6 +22,23 @@

# Remember to concatenate the training features and labels if you want to check their scatter plots (the recommended approach). You are free to explore label-to-label, feature-to-feature, etc. scatter plots by passing the appropriate arguments.
#============================================================================
#visualise_data(pd.concat([x_train,y_train],axis=1),"../images/data_image.png")
#visualise_data(pd.concat([x_train,y_train],axis=1),'../images/data_image.png')

# Write your solution here:

def plot_corr(data, size=11):
    """Display the correlation matrix of ``data`` as a heat map.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``corr()`` matrix is plotted.
    size : int, optional
        Width and height of the figure, passed as ``figsize=(size, size)``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    _fig, axis = plt.subplots(figsize=(size, size))
    correlations = data.corr()
    plt.set_cmap(cmap='YlOrRd')
    axis.matshow(correlations)

    # One tick per column, labelled with the column name.
    labels = correlations.columns
    positions = range(len(labels))
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    plt.show()

# plot_corr(df,size=11)



Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
25 changes: 22 additions & 3 deletions q11_feature_selection_q02_best_k_features/build.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %load q11_feature_selection_q02_best_k_features/build.py
# Default imports
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
Expand All @@ -14,13 +15,31 @@
df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train,x_test = label_encode(x_train,x_test)


np.random.seed(9)

# Write your code below

def percentile_k_features(x_train, y_train, k=50):
    """Return the names of the top-``k``-percentile features, best first.

    Features are scored with ``f_regression`` through ``SelectPercentile``;
    the selected feature names are returned sorted by descending score.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; its column names are what is returned.
    y_train
        Training targets.
    k : int, optional
        Percentile of features to keep.

    Returns
    -------
    list
        Selected feature names ordered from highest to lowest score.
    """
    picker = SelectPercentile(f_regression, percentile=k)
    picker.fit_transform(x_train, y_train)  # only the fitted state is used

    kept = picker.get_support()
    pairs = list(zip(x_train.columns.values[kept], picker.scores_[kept]))

    ranking = pd.DataFrame(pairs, columns=['Features', 'Scores'])
    ranking = ranking.sort_values('Scores', ascending=False)

    return list(ranking['Features'])

# percentile_k_features(x_train, y_train, k=50)





Expand Down
Binary file not shown.
Binary file not shown.
Binary file modified q12_feature_selection/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified q12_feature_selection/__pycache__/build.cpython-36.pyc
Binary file not shown.
Loading