diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index b0c3719..423888a 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_load_data/__pycache__/__init__.cpython-36.pyc b/q01_load_data/__pycache__/__init__.cpython-36.pyc index 4596200..cedda30 100644 Binary files a/q01_load_data/__pycache__/__init__.cpython-36.pyc and b/q01_load_data/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_load_data/__pycache__/build.cpython-36.pyc b/q01_load_data/__pycache__/build.cpython-36.pyc index 98e98a7..8c55f6b 100644 Binary files a/q01_load_data/__pycache__/build.cpython-36.pyc and b/q01_load_data/__pycache__/build.cpython-36.pyc differ diff --git a/q01_load_data/build.py b/q01_load_data/build.py index 7cd3700..421b7a2 100644 --- a/q01_load_data/build.py +++ b/q01_load_data/build.py @@ -1,4 +1,11 @@ +# %load q01_load_data/build.py import pandas as pd # Write your code below - +# path = 'data/student-mat.csv' + +def load_data(path): + return pd.read_csv(path, sep = ';') +# load_data(path) + + diff --git a/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc b/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc index d07fd2f..6e6da12 100644 Binary files a/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc and b/q01_load_data/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_load_data/tests/__pycache__/test.cpython-36.pyc b/q01_load_data/tests/__pycache__/test.cpython-36.pyc index 9aa6996..d4d2004 100644 Binary files a/q01_load_data/tests/__pycache__/test.cpython-36.pyc and b/q01_load_data/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q02_data_split/__pycache__/__init__.cpython-36.pyc b/q02_data_split/__pycache__/__init__.cpython-36.pyc index 5d17273..1056891 100644 Binary files a/q02_data_split/__pycache__/__init__.cpython-36.pyc and b/q02_data_split/__pycache__/__init__.cpython-36.pyc differ diff --git 
a/q02_data_split/__pycache__/build.cpython-36.pyc b/q02_data_split/__pycache__/build.cpython-36.pyc index e6bd2eb..76b1e39 100644 Binary files a/q02_data_split/__pycache__/build.cpython-36.pyc and b/q02_data_split/__pycache__/build.cpython-36.pyc differ diff --git a/q02_data_split/build.py b/q02_data_split/build.py index c2e7147..fac2332 100644 --- a/q02_data_split/build.py +++ b/q02_data_split/build.py @@ -1,8 +1,20 @@ +# %load q02_data_split/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data -from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split as tts import pandas as pd df = load_data('data/student-mat.csv') # Write your code below +def split_dataset(df): + X = df.drop('G3',axis=1) + y = df['G3'] + + X_train,X_test,y_train,y_test = tts(X,y,train_size = 0.8, random_state = 7) + + return X_train, X_test, y_train, y_test + +# split_dataset(df) + + diff --git a/q02_data_split/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_split/tests/__pycache__/__init__.cpython-36.pyc index e780e63..ac870a9 100644 Binary files a/q02_data_split/tests/__pycache__/__init__.cpython-36.pyc and b/q02_data_split/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_split/tests/__pycache__/test.cpython-36.pyc b/q02_data_split/tests/__pycache__/test.cpython-36.pyc index a1b3fc5..0a76868 100644 Binary files a/q02_data_split/tests/__pycache__/test.cpython-36.pyc and b/q02_data_split/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q03_data_encoding/__pycache__/__init__.cpython-36.pyc b/q03_data_encoding/__pycache__/__init__.cpython-36.pyc index 884722b..66fab60 100644 Binary files a/q03_data_encoding/__pycache__/__init__.cpython-36.pyc and b/q03_data_encoding/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_data_encoding/__pycache__/build.cpython-36.pyc b/q03_data_encoding/__pycache__/build.cpython-36.pyc index 302366c..18640a0 100644 Binary files 
a/q03_data_encoding/__pycache__/build.cpython-36.pyc and b/q03_data_encoding/__pycache__/build.cpython-36.pyc differ diff --git a/q03_data_encoding/build.py b/q03_data_encoding/build.py index bb4c8ca..283b82b 100644 --- a/q03_data_encoding/build.py +++ b/q03_data_encoding/build.py @@ -1,3 +1,4 @@ +# %load q03_data_encoding/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset from sklearn.preprocessing import LabelEncoder @@ -5,10 +6,19 @@ import pandas as pd df = load_data('data/student-mat.csv') -x_train, x_test, y_train, y_test = split_dataset(df) +X_train, X_test, y_train, y_test = split_dataset(df) # Write your code below +def label_encode(X_train,X_test): + + le = LabelEncoder() + + X_train = X_train.apply(le.fit_transform) + X_test = X_test.apply(le.fit_transform) + return X_train, X_test + +# label_encode(X_train,X_test) diff --git a/q03_data_encoding/tests/__pycache__/__init__.cpython-36.pyc b/q03_data_encoding/tests/__pycache__/__init__.cpython-36.pyc index 7d18c18..c084ebf 100644 Binary files a/q03_data_encoding/tests/__pycache__/__init__.cpython-36.pyc and b/q03_data_encoding/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_data_encoding/tests/__pycache__/test.cpython-36.pyc b/q03_data_encoding/tests/__pycache__/test.cpython-36.pyc index 8ade2b7..6faf62b 100644 Binary files a/q03_data_encoding/tests/__pycache__/test.cpython-36.pyc and b/q03_data_encoding/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q03_ohe_encoder/__pycache__/__init__.cpython-36.pyc b/q03_ohe_encoder/__pycache__/__init__.cpython-36.pyc index e4ec35b..97cc71b 100644 Binary files a/q03_ohe_encoder/__pycache__/__init__.cpython-36.pyc and b/q03_ohe_encoder/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_ohe_encoder/__pycache__/build.cpython-36.pyc b/q03_ohe_encoder/__pycache__/build.cpython-36.pyc index 1433b7b..dc2b783 
100644 Binary files a/q03_ohe_encoder/__pycache__/build.cpython-36.pyc and b/q03_ohe_encoder/__pycache__/build.cpython-36.pyc differ diff --git a/q03_ohe_encoder/build.py b/q03_ohe_encoder/build.py index 36e4b90..5c66ce2 100644 --- a/q03_ohe_encoder/build.py +++ b/q03_ohe_encoder/build.py @@ -1,19 +1,29 @@ +# %load q03_ohe_encoder/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset +from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode from sklearn.preprocessing import OneHotEncoder import pandas as pd import numpy as np df = load_data('data/student-mat.csv') -x_train, x_test, y_train, y_test = split_dataset(df) +X_train, X_test, y_train, y_test = split_dataset(df) category_index = [x for x in range(len(df.columns)) if df[df.columns[x]].dtype == 'object'] # Write your code below - +def ohe_encode(X_train,X_test,category_index=category_index): + X_train,X_test=label_encode(X_train,X_test) + ohe = OneHotEncoder(categorical_features=category_index,sparse=False) + ohe.fit(X_train) + ohe.fit(X_test) + X_train = ohe.transform(X_train) + X_test = ohe.transform(X_test) - + return pd.DataFrame(X_train),pd.DataFrame(X_test) - +# ohe_encode(X_train,X_test,category_index) + + diff --git a/q03_ohe_encoder/tests/__pycache__/__init__.cpython-36.pyc b/q03_ohe_encoder/tests/__pycache__/__init__.cpython-36.pyc index 8c87a88..0e77827 100644 Binary files a/q03_ohe_encoder/tests/__pycache__/__init__.cpython-36.pyc and b/q03_ohe_encoder/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_ohe_encoder/tests/__pycache__/test.cpython-36.pyc b/q03_ohe_encoder/tests/__pycache__/test.cpython-36.pyc index 1956a19..8ee1d1d 100644 Binary files a/q03_ohe_encoder/tests/__pycache__/test.cpython-36.pyc and b/q03_ohe_encoder/tests/__pycache__/test.cpython-36.pyc differ diff --git 
a/q04_data_visualisation/__pycache__/__init__.cpython-36.pyc b/q04_data_visualisation/__pycache__/__init__.cpython-36.pyc index d44a511..179b2e1 100644 Binary files a/q04_data_visualisation/__pycache__/__init__.cpython-36.pyc and b/q04_data_visualisation/__pycache__/__init__.cpython-36.pyc differ diff --git a/q04_data_visualisation/__pycache__/build.cpython-36.pyc b/q04_data_visualisation/__pycache__/build.cpython-36.pyc index 2bfbd4e..c689d6a 100644 Binary files a/q04_data_visualisation/__pycache__/build.cpython-36.pyc and b/q04_data_visualisation/__pycache__/build.cpython-36.pyc differ diff --git a/q04_data_visualisation/build.py b/q04_data_visualisation/build.py index 9c15ad9..e9907d6 100644 --- a/q04_data_visualisation/build.py +++ b/q04_data_visualisation/build.py @@ -1,16 +1,22 @@ -# -*- coding: utf-8 -*- +# %load q04_data_visualisation/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode import matplotlib.pyplot as plt from pandas.plotting import scatter_matrix +# %matploylib.inline + data = load_data('data/student-mat.csv') -x_train, x_test, y_train, y_test = split_dataset(data) -x_train,x_test = label_encode(x_train,x_test) +X_train, X_test, y_train, y_test = split_dataset(data) +X_train,X_test = label_encode(X_train,X_test) # Write your code below +def visualise_data(data,figname): + scatter_matrix(data, alpha=0.2, figsize=(15,15), diagonal='kde') + plt.show() - +# visualise_data(data,'figname') + diff --git a/q04_data_visualisation/tests/__pycache__/__init__.cpython-36.pyc b/q04_data_visualisation/tests/__pycache__/__init__.cpython-36.pyc index 6631d03..80e8a4e 100644 Binary files a/q04_data_visualisation/tests/__pycache__/__init__.cpython-36.pyc and b/q04_data_visualisation/tests/__pycache__/__init__.cpython-36.pyc differ diff 
--git a/q04_data_visualisation/tests/__pycache__/test.cpython-36.pyc b/q04_data_visualisation/tests/__pycache__/test.cpython-36.pyc index 5353356..3788e3c 100644 Binary files a/q04_data_visualisation/tests/__pycache__/test.cpython-36.pyc and b/q04_data_visualisation/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q05_linear_regression_model/__pycache__/__init__.cpython-36.pyc b/q05_linear_regression_model/__pycache__/__init__.cpython-36.pyc index 06a2a9b..c1dab24 100644 Binary files a/q05_linear_regression_model/__pycache__/__init__.cpython-36.pyc and b/q05_linear_regression_model/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_linear_regression_model/__pycache__/build.cpython-36.pyc b/q05_linear_regression_model/__pycache__/build.cpython-36.pyc index c40d112..59af312 100644 Binary files a/q05_linear_regression_model/__pycache__/build.cpython-36.pyc and b/q05_linear_regression_model/__pycache__/build.cpython-36.pyc differ diff --git a/q05_linear_regression_model/build.py b/q05_linear_regression_model/build.py index 7a0a243..10abaa7 100644 --- a/q05_linear_regression_model/build.py +++ b/q05_linear_regression_model/build.py @@ -1,3 +1,4 @@ +# %load q05_linear_regression_model/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset @@ -6,10 +7,17 @@ df = load_data('data/student-mat.csv') -x_train, x_test, y_train, y_test = split_dataset(df) +X_train, X_test, y_train, y_test = split_dataset(df) -x_train, x_test = label_encode(x_train,x_test) +X_train, X_test = label_encode(X_train,X_test) # Write your code below - +def linear_regression(X,y): + linreg = LinearRegression() + model = linreg.fit(X,y) + return model + +# linear_regression(X_train,y_train) + + diff --git a/q05_linear_regression_model/tests/__pycache__/__init__.cpython-36.pyc b/q05_linear_regression_model/tests/__pycache__/__init__.cpython-36.pyc index 
296bcce..4256677 100644 Binary files a/q05_linear_regression_model/tests/__pycache__/__init__.cpython-36.pyc and b/q05_linear_regression_model/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q05_linear_regression_model/tests/__pycache__/test.cpython-36.pyc b/q05_linear_regression_model/tests/__pycache__/test.cpython-36.pyc index 54551b9..5f175d4 100644 Binary files a/q05_linear_regression_model/tests/__pycache__/test.cpython-36.pyc and b/q05_linear_regression_model/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q06_cross_validation/__pycache__/__init__.cpython-36.pyc b/q06_cross_validation/__pycache__/__init__.cpython-36.pyc index 9a1c3aa..dde2f98 100644 Binary files a/q06_cross_validation/__pycache__/__init__.cpython-36.pyc and b/q06_cross_validation/__pycache__/__init__.cpython-36.pyc differ diff --git a/q06_cross_validation/__pycache__/build.cpython-36.pyc b/q06_cross_validation/__pycache__/build.cpython-36.pyc index 2e1c378..2305854 100644 Binary files a/q06_cross_validation/__pycache__/build.cpython-36.pyc and b/q06_cross_validation/__pycache__/build.cpython-36.pyc differ diff --git a/q06_cross_validation/build.py b/q06_cross_validation/build.py index 406a734..d2d414d 100644 --- a/q06_cross_validation/build.py +++ b/q06_cross_validation/build.py @@ -1,3 +1,4 @@ +# %load q06_cross_validation/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset @@ -18,4 +19,12 @@ model =linear_regression(x_train,y_train) # Write your code below + +def cross_validation_regressor(model, X,y): + r2_score = cross_val_score(model,X,y,scoring='r2') + return r2_score.mean() + +# cross_validation_regressor(model, x_train,y_train) + + diff --git a/q06_cross_validation/tests/__pycache__/__init__.cpython-36.pyc b/q06_cross_validation/tests/__pycache__/__init__.cpython-36.pyc index b571b36..d696aaa 100644 Binary files 
a/q06_cross_validation/tests/__pycache__/__init__.cpython-36.pyc and b/q06_cross_validation/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q06_cross_validation/tests/__pycache__/test.cpython-36.pyc b/q06_cross_validation/tests/__pycache__/test.cpython-36.pyc index e065247..b41c136 100644 Binary files a/q06_cross_validation/tests/__pycache__/test.cpython-36.pyc and b/q06_cross_validation/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q07_regression_pred/__pycache__/__init__.cpython-36.pyc b/q07_regression_pred/__pycache__/__init__.cpython-36.pyc index 3e7e467..1d41f19 100644 Binary files a/q07_regression_pred/__pycache__/__init__.cpython-36.pyc and b/q07_regression_pred/__pycache__/__init__.cpython-36.pyc differ diff --git a/q07_regression_pred/__pycache__/build.cpython-36.pyc b/q07_regression_pred/__pycache__/build.cpython-36.pyc index dfa0411..d8af45c 100644 Binary files a/q07_regression_pred/__pycache__/build.cpython-36.pyc and b/q07_regression_pred/__pycache__/build.cpython-36.pyc differ diff --git a/q07_regression_pred/build.py b/q07_regression_pred/build.py index 3f2eee3..bc844fb 100644 --- a/q07_regression_pred/build.py +++ b/q07_regression_pred/build.py @@ -1,3 +1,4 @@ +# %load q07_regression_pred/build.py from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score @@ -20,3 +21,14 @@ # Write your code below +def regression_predictor(model, X, y): + + y_pred = model.predict(X) + mse = mean_squared_error(y, y_pred) + mae = mean_absolute_error(y, y_pred) + r2 = r2_score(y, y_pred) + return y_pred, mse, mae, r2 + +# regression_predictor(model, x_test, y_test) + + diff --git a/q07_regression_pred/tests/__pycache__/__init__.cpython-36.pyc b/q07_regression_pred/tests/__pycache__/__init__.cpython-36.pyc index f1435e5..624e1d0 100644 Binary files a/q07_regression_pred/tests/__pycache__/__init__.cpython-36.pyc and b/q07_regression_pred/tests/__pycache__/__init__.cpython-36.pyc differ diff --git 
a/q07_regression_pred/tests/__pycache__/test.cpython-36.pyc b/q07_regression_pred/tests/__pycache__/test.cpython-36.pyc index 203c5ff..95935bd 100644 Binary files a/q07_regression_pred/tests/__pycache__/test.cpython-36.pyc and b/q07_regression_pred/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q08_linear_model/__pycache__/__init__.cpython-36.pyc b/q08_linear_model/__pycache__/__init__.cpython-36.pyc index b91b141..47d323d 100644 Binary files a/q08_linear_model/__pycache__/__init__.cpython-36.pyc and b/q08_linear_model/__pycache__/__init__.cpython-36.pyc differ diff --git a/q08_linear_model/__pycache__/build.cpython-36.pyc b/q08_linear_model/__pycache__/build.cpython-36.pyc index 438fb94..13ab053 100644 Binary files a/q08_linear_model/__pycache__/build.cpython-36.pyc and b/q08_linear_model/__pycache__/build.cpython-36.pyc differ diff --git a/q08_linear_model/build.py b/q08_linear_model/build.py index 85d49da..22f94d1 100644 --- a/q08_linear_model/build.py +++ b/q08_linear_model/build.py @@ -1,3 +1,4 @@ +# %load q08_linear_model/build.py import pandas as pd import numpy as np from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data @@ -17,4 +18,14 @@ # Write your code below - +def linear_model(x_train, x_test, y_train, y_test): + + G = linear_regression(x_train, y_train) + y_pred, rmse, mae, r2 = regression_predictor(G, x_test, y_test) + val = cross_validation_regressor(model, x_train, y_train) + stats = pd.DataFrame([(val,mae,mse,r2)], columns = ['cross_val','mae','rmse','r2']) + return G, y_pred, stats + +# linear_model(x_train, x_test, y_train, y_test) + + diff --git a/q08_linear_model/tests/__pycache__/__init__.cpython-36.pyc b/q08_linear_model/tests/__pycache__/__init__.cpython-36.pyc index 5f231d2..de7c851 100644 Binary files a/q08_linear_model/tests/__pycache__/__init__.cpython-36.pyc and b/q08_linear_model/tests/__pycache__/__init__.cpython-36.pyc differ diff --git 
a/q08_linear_model/tests/__pycache__/test.cpython-36.pyc b/q08_linear_model/tests/__pycache__/test.cpython-36.pyc index cbaeda3..767e280 100644 Binary files a/q08_linear_model/tests/__pycache__/test.cpython-36.pyc and b/q08_linear_model/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q09_advanced_model_q01_lasso/__pycache__/__init__.cpython-36.pyc b/q09_advanced_model_q01_lasso/__pycache__/__init__.cpython-36.pyc index b8b8fc7..240d92c 100644 Binary files a/q09_advanced_model_q01_lasso/__pycache__/__init__.cpython-36.pyc and b/q09_advanced_model_q01_lasso/__pycache__/__init__.cpython-36.pyc differ diff --git a/q09_advanced_model_q01_lasso/__pycache__/build.cpython-36.pyc b/q09_advanced_model_q01_lasso/__pycache__/build.cpython-36.pyc index ad763a5..8b1b5d6 100644 Binary files a/q09_advanced_model_q01_lasso/__pycache__/build.cpython-36.pyc and b/q09_advanced_model_q01_lasso/__pycache__/build.cpython-36.pyc differ diff --git a/q09_advanced_model_q01_lasso/build.py b/q09_advanced_model_q01_lasso/build.py index c832d59..77fec51 100644 --- a/q09_advanced_model_q01_lasso/build.py +++ b/q09_advanced_model_q01_lasso/build.py @@ -1,3 +1,4 @@ +# %load q09_advanced_model_q01_lasso/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset @@ -20,4 +21,15 @@ # Write your solution here +def lasso(x_train, x_test, y_train, y_test,alpha=0.1): + G = Lasso(alpha = alpha) + G.fit(x_train, y_train) + val = cross_validation_regressor(G,x_train,y_train) + y_pred, mse, mae, r2 = regression_predictor(G, x_test, y_test) + stats = pd.DataFrame([(val,mae,r2,np.sqrt(mse))], columns = ['cross_val','mae','r2','rmse']) + return G, y_pred, stats + +# lasso(x_train, x_test, y_train, y_test) + + diff --git a/q09_advanced_model_q01_lasso/tests/__pycache__/__init__.cpython-36.pyc b/q09_advanced_model_q01_lasso/tests/__pycache__/__init__.cpython-36.pyc index 
80296f7..580b97b 100644 Binary files a/q09_advanced_model_q01_lasso/tests/__pycache__/__init__.cpython-36.pyc and b/q09_advanced_model_q01_lasso/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q09_advanced_model_q01_lasso/tests/__pycache__/test.cpython-36.pyc b/q09_advanced_model_q01_lasso/tests/__pycache__/test.cpython-36.pyc index 3d92981..be1b7b7 100644 Binary files a/q09_advanced_model_q01_lasso/tests/__pycache__/test.cpython-36.pyc and b/q09_advanced_model_q01_lasso/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q09_advanced_model_q02_ridge/__pycache__/__init__.cpython-36.pyc b/q09_advanced_model_q02_ridge/__pycache__/__init__.cpython-36.pyc index 222893d..60aedf7 100644 Binary files a/q09_advanced_model_q02_ridge/__pycache__/__init__.cpython-36.pyc and b/q09_advanced_model_q02_ridge/__pycache__/__init__.cpython-36.pyc differ diff --git a/q09_advanced_model_q02_ridge/__pycache__/build.cpython-36.pyc b/q09_advanced_model_q02_ridge/__pycache__/build.cpython-36.pyc index 29083a5..5352fe7 100644 Binary files a/q09_advanced_model_q02_ridge/__pycache__/build.cpython-36.pyc and b/q09_advanced_model_q02_ridge/__pycache__/build.cpython-36.pyc differ diff --git a/q09_advanced_model_q02_ridge/build.py b/q09_advanced_model_q02_ridge/build.py index 0fb3e1a..14ee6e0 100644 --- a/q09_advanced_model_q02_ridge/build.py +++ b/q09_advanced_model_q02_ridge/build.py @@ -1,3 +1,4 @@ +# %load q09_advanced_model_q02_ridge/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset @@ -20,6 +21,16 @@ # Write your code below - +def ridge(x_train, x_test, y_train, y_test,alpha=0.1): + + G = Ridge(alpha = alpha, normalize= True, random_state=9) + G.fit(x_train, y_train) + val = cross_validation_regressor(G,x_train,y_train) + y_pred, mse, mae, r2 = regression_predictor(G, x_test, y_test) + stats = pd.DataFrame([(val,mae,r2,np.sqrt(mse))], 
columns = ['cross_val','mae','r2','rmse']) + return G, y_pred, stats +# ridge(x_train, x_test, y_train, y_test,alpha=0.1) + + diff --git a/q09_advanced_model_q02_ridge/tests/__pycache__/__init__.cpython-36.pyc b/q09_advanced_model_q02_ridge/tests/__pycache__/__init__.cpython-36.pyc index 602e1f5..cdef0de 100644 Binary files a/q09_advanced_model_q02_ridge/tests/__pycache__/__init__.cpython-36.pyc and b/q09_advanced_model_q02_ridge/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q09_advanced_model_q02_ridge/tests/__pycache__/test.cpython-36.pyc b/q09_advanced_model_q02_ridge/tests/__pycache__/test.cpython-36.pyc index 37f31c3..603ba9e 100644 Binary files a/q09_advanced_model_q02_ridge/tests/__pycache__/test.cpython-36.pyc and b/q09_advanced_model_q02_ridge/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q10_data_missing_values/__pycache__/__init__.cpython-36.pyc b/q10_data_missing_values/__pycache__/__init__.cpython-36.pyc index 9f50df2..3f4c741 100644 Binary files a/q10_data_missing_values/__pycache__/__init__.cpython-36.pyc and b/q10_data_missing_values/__pycache__/__init__.cpython-36.pyc differ diff --git a/q10_data_missing_values/__pycache__/build.cpython-36.pyc b/q10_data_missing_values/__pycache__/build.cpython-36.pyc index 5c075f4..5e9f8f8 100644 Binary files a/q10_data_missing_values/__pycache__/build.cpython-36.pyc and b/q10_data_missing_values/__pycache__/build.cpython-36.pyc differ diff --git a/q10_data_missing_values/build.py b/q10_data_missing_values/build.py index 582edbb..d84b74d 100644 --- a/q10_data_missing_values/build.py +++ b/q10_data_missing_values/build.py @@ -1,3 +1,4 @@ +# %load q10_data_missing_values/build.py from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode @@ -11,3 +12,11 @@ # Write your code below 
+def describe_df(x_train): + describe = x_train.describe() + value_counts = x_train.apply(pd.value_counts) + return describe, value_counts + +# describe_df(x_train) + + diff --git a/q10_data_missing_values/tests/__pycache__/__init__.cpython-36.pyc b/q10_data_missing_values/tests/__pycache__/__init__.cpython-36.pyc index 2fdd38b..653358f 100644 Binary files a/q10_data_missing_values/tests/__pycache__/__init__.cpython-36.pyc and b/q10_data_missing_values/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q10_data_missing_values/tests/__pycache__/test.cpython-36.pyc b/q10_data_missing_values/tests/__pycache__/test.cpython-36.pyc index 1701926..0f864dd 100644 Binary files a/q10_data_missing_values/tests/__pycache__/test.cpython-36.pyc and b/q10_data_missing_values/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q11_feature_selection_q01_plot_corr/__pycache__/__init__.cpython-36.pyc b/q11_feature_selection_q01_plot_corr/__pycache__/__init__.cpython-36.pyc index 35c8cae..21a864e 100644 Binary files a/q11_feature_selection_q01_plot_corr/__pycache__/__init__.cpython-36.pyc and b/q11_feature_selection_q01_plot_corr/__pycache__/__init__.cpython-36.pyc differ diff --git a/q11_feature_selection_q01_plot_corr/__pycache__/build.cpython-36.pyc b/q11_feature_selection_q01_plot_corr/__pycache__/build.cpython-36.pyc index 35748ec..9555a6b 100644 Binary files a/q11_feature_selection_q01_plot_corr/__pycache__/build.cpython-36.pyc and b/q11_feature_selection_q01_plot_corr/__pycache__/build.cpython-36.pyc differ diff --git a/q11_feature_selection_q01_plot_corr/build.py b/q11_feature_selection_q01_plot_corr/build.py index 0427922..b9e1505 100644 --- a/q11_feature_selection_q01_plot_corr/build.py +++ b/q11_feature_selection_q01_plot_corr/build.py @@ -1,3 +1,4 @@ +# %load q11_feature_selection_q01_plot_corr/build.py import matplotlib.pyplot as plt from matplotlib.pyplot import yticks, xticks, subplots, set_cmap @@ -17,10 +18,36 @@ # 
============================================================================= -# To visualise data, you need to pass training data only as the assumption holds that test set is unknown data and obviously,you cant not make decision based on unseen data :-p +# To visualise data, you need to pass training data only as the assumption holds +# that test set is unknown data and obviously,you cant not make decision based +# on unseen data :-p -#Remember to concatenate training features and labels if you want to check that scatterplots which I would prefer.You are free to explore labels to labels, features to features ,etc scatterplots as you want by passing arguments -#============================================================================ -#visualise_data(pd.concat([x_train,y_train],axis=1),"../images/data_image.png") +# Remember to concatenate training features and labels if you want to check that +# scatterplots which I would prefer.You are free to explore labels to labels, +# features to features ,etc scatterplots as you want by passing arguments +#============================================================================= +#visualise_data(pd.concat([x_train,y_train],axis=1),'../images/data_image.png') # Write your solution here: +# def plot_corr(data,size=11): +# import seaborn as sns + +# fig,ax = plt.subplots(figsize=(size,size)) +# corr = data.corr() +# sns.heatmap(corr,cmap='YlOrRd') +# plt.show() + +def plot_corr(data, size=11): + + fig, ax = plt.subplots(figsize=(size, size)) + corr = data.corr() + plt.set_cmap(cmap='YlOrRd') + ax.matshow(corr) + plt.xticks(range(len(corr.columns)), corr.columns, rotation=90); + plt.yticks(range(len(corr.columns)), corr.columns); + plt.show() + return + +# plot_corr(df,size=11) + + diff --git a/q11_feature_selection_q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc b/q11_feature_selection_q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc index 6c1c509..5c6ab40 100644 Binary files 
a/q11_feature_selection_q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc and b/q11_feature_selection_q01_plot_corr/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q11_feature_selection_q01_plot_corr/tests/__pycache__/test.cpython-36.pyc b/q11_feature_selection_q01_plot_corr/tests/__pycache__/test.cpython-36.pyc index 93b5347..24604f5 100644 Binary files a/q11_feature_selection_q01_plot_corr/tests/__pycache__/test.cpython-36.pyc and b/q11_feature_selection_q01_plot_corr/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q11_feature_selection_q02_best_k_features/__pycache__/__init__.cpython-36.pyc b/q11_feature_selection_q02_best_k_features/__pycache__/__init__.cpython-36.pyc index cce1771..6563a18 100644 Binary files a/q11_feature_selection_q02_best_k_features/__pycache__/__init__.cpython-36.pyc and b/q11_feature_selection_q02_best_k_features/__pycache__/__init__.cpython-36.pyc differ diff --git a/q11_feature_selection_q02_best_k_features/__pycache__/build.cpython-36.pyc b/q11_feature_selection_q02_best_k_features/__pycache__/build.cpython-36.pyc index b0c88c7..4c5cd11 100644 Binary files a/q11_feature_selection_q02_best_k_features/__pycache__/build.cpython-36.pyc and b/q11_feature_selection_q02_best_k_features/__pycache__/build.cpython-36.pyc differ diff --git a/q11_feature_selection_q02_best_k_features/build.py b/q11_feature_selection_q02_best_k_features/build.py index 95002c5..37651c0 100644 --- a/q11_feature_selection_q02_best_k_features/build.py +++ b/q11_feature_selection_q02_best_k_features/build.py @@ -1,3 +1,4 @@ +# %load q11_feature_selection_q02_best_k_features/build.py # Default imports from sklearn.feature_selection import SelectPercentile from sklearn.feature_selection import f_regression @@ -20,8 +21,18 @@ np.random.seed(9) # Write your code below - - +def percentile_k_features(x_train, y_train, k=50): + selector = SelectPercentile(f_regression,percentile=k) + selector.fit_transform(x_train, y_train) + scores = 
selector.scores_[selector.get_support()] + features = x_train.columns.values[selector.get_support()] + features_scores_list = list(zip(features,scores)) + df = pd.DataFrame(features_scores_list, columns=['Features','Scores']) + sorted_list = df.sort_values('Scores',ascending=False) + top_k_predictors = list(sorted_list['Features']) + return top_k_predictors + +# percentile_k_features(x_train, y_train, k=50) diff --git a/q11_feature_selection_q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc b/q11_feature_selection_q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc index 3a1830b..13e3cf9 100644 Binary files a/q11_feature_selection_q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc and b/q11_feature_selection_q02_best_k_features/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q11_feature_selection_q02_best_k_features/tests/__pycache__/test.cpython-36.pyc b/q11_feature_selection_q02_best_k_features/tests/__pycache__/test.cpython-36.pyc index 7c11282..276e402 100644 Binary files a/q11_feature_selection_q02_best_k_features/tests/__pycache__/test.cpython-36.pyc and b/q11_feature_selection_q02_best_k_features/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q12_feature_selection/__pycache__/__init__.cpython-36.pyc b/q12_feature_selection/__pycache__/__init__.cpython-36.pyc index 886fe32..cb7e565 100644 Binary files a/q12_feature_selection/__pycache__/__init__.cpython-36.pyc and b/q12_feature_selection/__pycache__/__init__.cpython-36.pyc differ diff --git a/q12_feature_selection/__pycache__/build.cpython-36.pyc b/q12_feature_selection/__pycache__/build.cpython-36.pyc index 7c97eeb..86542aa 100644 Binary files a/q12_feature_selection/__pycache__/build.cpython-36.pyc and b/q12_feature_selection/__pycache__/build.cpython-36.pyc differ diff --git a/q12_feature_selection/build.py b/q12_feature_selection/build.py index 1bbe2b2..bf88c03 100644 --- a/q12_feature_selection/build.py +++ b/q12_feature_selection/build.py @@ -1,3 
+1,4 @@ +# %load q12_feature_selection/build.py # import matplotlib.pyplot as plt from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset @@ -16,3 +17,12 @@ x_train,x_test = label_encode(x_train,x_test) # Write your code below + +def feature_selection(x_train, y_train, k=50): + plot_corr(x_train) + reduced_features = percentile_k_features(x_train,y_train,k) + return reduced_features + +# feature_selection(x_train, y_train, k=50) + + diff --git a/q12_feature_selection/tests/__pycache__/__init__.cpython-36.pyc b/q12_feature_selection/tests/__pycache__/__init__.cpython-36.pyc index 199811e..5eef781 100644 Binary files a/q12_feature_selection/tests/__pycache__/__init__.cpython-36.pyc and b/q12_feature_selection/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q12_feature_selection/tests/__pycache__/test.cpython-36.pyc b/q12_feature_selection/tests/__pycache__/test.cpython-36.pyc index 3a7de81..35d77b4 100644 Binary files a/q12_feature_selection/tests/__pycache__/test.cpython-36.pyc and b/q12_feature_selection/tests/__pycache__/test.cpython-36.pyc differ diff --git a/q13_plot_residuals/__pycache__/__init__.cpython-36.pyc b/q13_plot_residuals/__pycache__/__init__.cpython-36.pyc index 339472d..4e48976 100644 Binary files a/q13_plot_residuals/__pycache__/__init__.cpython-36.pyc and b/q13_plot_residuals/__pycache__/__init__.cpython-36.pyc differ diff --git a/q13_plot_residuals/__pycache__/build.cpython-36.pyc b/q13_plot_residuals/__pycache__/build.cpython-36.pyc index b3cfbaf..88511ee 100644 Binary files a/q13_plot_residuals/__pycache__/build.cpython-36.pyc and b/q13_plot_residuals/__pycache__/build.cpython-36.pyc differ diff --git a/q13_plot_residuals/build.py b/q13_plot_residuals/build.py index 9cdb3e3..e25947e 100644 --- a/q13_plot_residuals/build.py +++ b/q13_plot_residuals/build.py @@ -1,5 +1,39 @@ +# %load q13_plot_residuals/build.py 
from greyatomlib.multivariate_regression_project.q01_load_data.build import load_data
from greyatomlib.multivariate_regression_project.q02_data_split.build import split_dataset
from greyatomlib.multivariate_regression_project.q03_data_encoding.build import label_encode
from greyatomlib.multivariate_regression_project.q07_regression_pred.build import regression_predictor
from sklearn.linear_model import Ridge
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from greyatomlib.multivariate_regression_project.q06_cross_validation.build import cross_validation_regressor

# Fixed seed so repeated runs of this module behave the same way.
np.random.seed(9)

df = load_data('data/student-mat.csv')

x_train, x_test, y_train, y_test = split_dataset(df)

x_train, x_test = label_encode(x_train, x_test)

# Ridge model fitted on the encoded training split; used as the sample
# model for plot_residuals below.
l2 = Ridge(alpha=0.01)

l2.fit(x_train, y_train)


# Write your code below

def plot_residuals(model, x_test, y_test):
    """Scatter-plot the residuals of ``model`` against the actual targets.

    Predictions come from ``regression_predictor``; the residual for each
    sample is ``y_test - y_pred``. The figure is displayed with
    ``plt.show()`` and nothing is returned.

    Args:
        model: Fitted regressor, passed straight to ``regression_predictor``.
        x_test: Test features, passed straight to ``regression_predictor``.
        y_test: Actual target values; plotted on the x-axis. When it carries
            a ``name`` attribute (e.g. a pandas Series) that name labels the
            axis, otherwise a generic label is used.
    """
    # Only the predictions are needed here; the three metrics are discarded.
    y_pred, _mse, _mae, _r2 = regression_predictor(model, x_test, y_test)
    residuals = y_test - y_pred

    # The axis shows the actual target, so label it with the target's own
    # name instead of a hard-coded column from an unrelated dataset.
    target_name = getattr(y_test, 'name', None)
    x_label = 'Actual values' if target_name is None else str(target_name)

    plt.scatter(y_test, residuals)
    plt.title('Residual Plot')
    plt.xlabel(x_label)
    plt.ylabel('Errors')
    plt.show()

# plot_residuals(l2, x_test, y_test)
def create_stats(x_train, x_test, y_train, y_test):
    """Collect lasso and ridge statistics with and without feature selection.

    Each model is run twice with ``alpha=0.1``: once on all columns and once
    on the columns returned by ``feature_selection(x_train, y_train, k=50)``.

    Args:
        x_train: Training features; must support column selection by a list
            of names.
        x_test: Test features with the same columns as ``x_train``.
        y_train: Training target.
        y_test: Test target.

    Returns:
        ``pd.concat`` of the four statistics objects, in the order
        lasso (all), lasso (selected), ridge (all), ridge (selected).
    """
    # Baseline runs on the complete feature set. Only the third element of
    # each helper's result (the statistics) is kept.
    _, _, lasso_all = lasso(x_train, x_test, y_train, y_test, alpha=0.1)
    _, _, ridge_all = ridge(x_train, x_test, y_train, y_test, alpha=0.1)

    # Restrict both splits to the selected columns; copies keep the
    # caller's frames untouched.
    selected = feature_selection(x_train, y_train, k=50)
    train_selected = x_train[selected].copy()
    test_selected = x_test[selected].copy()

    _, _, lasso_selected = lasso(
        train_selected, test_selected, y_train, y_test, alpha=0.1
    )
    _, _, ridge_selected = ridge(
        train_selected, test_selected, y_train, y_test, alpha=0.1
    )

    return pd.concat([lasso_all, lasso_selected, ridge_all, ridge_selected])

# create_stats(x_train, x_test, y_train, y_test)
def complete_build(x_train, x_test, y_train, y_test):
    """Benchmark the models on the given splits and on their ``ohe_encode`` output.

    ``create_stats`` is run first on the splits as passed in, then on the
    frames rebuilt from the ``ohe_encode`` result, and the two result tables
    are stacked row-wise.

    Args:
        x_train: Training features (DataFrame).
        x_test: Test features (DataFrame).
        y_train: Training target.
        y_test: Test target.

    Returns:
        ``pd.concat`` (``axis=0``) of the statistics for the input splits
        followed by the statistics for the encoded splits.
    """
    # Positional indices of the object-dtype columns. Walking ``dtypes``
    # once avoids indexing the frame by label for every column.
    category_index = [
        position
        for position, column_dtype in enumerate(x_train.dtypes)
        if column_dtype == 'object'
    ]

    encoded_train, encoded_test = ohe_encode(x_train, x_test, category_index)

    # Re-wrap the encoder output and restore the original column labels.
    # NOTE(review): this assumes ohe_encode keeps the column count
    # unchanged -- confirm against its implementation.
    train = pd.DataFrame(encoded_train)
    test = pd.DataFrame(encoded_test)
    train.columns = x_train.columns.values
    test.columns = x_test.columns.values

    stats_input = create_stats(x_train, x_test, y_train, y_test)
    stats_encoded = create_stats(train, test, y_train, y_test)
    return pd.concat([stats_input, stats_encoded], axis=0)

# complete_build(x_train, x_test, y_train, y_test)
b/q15_select_best_model/tests/__pycache__/test.cpython-36.pyc differ