### Data Dictionary

grade: The grade in school of the student (most 15-year-olds in America are in 10th grade)

male: Whether the student is male (1/0)

raceeth: The race/ethnicity composite of the student

preschool: Whether the student attended preschool (1/0)

expectBachelors: Whether the student expects to obtain a bachelor's degree (1/0)

motherHS: Whether the student's mother completed high school (1/0)

motherBachelors: Whether the student's mother obtained a bachelor's degree (1/0)

motherWork: Whether the student's mother has part-time or full-time work (1/0)

fatherHS: Whether the student's father completed high school (1/0)

fatherBachelors: Whether the student's father obtained a bachelor's degree (1/0)

fatherWork: Whether the student's father has part-time or full-time work (1/0)

selfBornUS: Whether the student was born in the United States of America (1/0)

motherBornUS: Whether the student's mother was born in the United States of America (1/0)

fatherBornUS: Whether the student's father was born in the United States of America (1/0)

englishAtHome: Whether the student speaks English at home (1/0)

computerForSchoolwork: Whether the student has access to a computer for schoolwork (1/0)

read30MinsADay: Whether the student reads for pleasure for 30 minutes/day (1/0)

minutesPerWeekEnglish: The number of minutes per week the student spends in English class

studentsInEnglish: The number of students in this student's English class at school

schoolHasLibrary: Whether this student's school has a library (1/0)

publicSchool: Whether this student attends a public school (1/0)

urban: Whether this student's school is in an urban area (1/0)

schoolSize: The number of students in this student's school

readingScore: The student's reading score, on a 1000-point scale

In this homework assignment, 
we will predict the reading scores of students from the United States of America on the 2009 PISA exam.

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import to_graphviz, plot_importance

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import _hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingRegressor

%matplotlib inline
# Set the style and the font scale in a single call: sns.set(...) re-applies its own
# default style ('darkgrid') unless told otherwise, so calling it after
# sns.set_style('dark') would silently undo the 'dark' style.
sns.set(style='dark', font_scale=1.5)

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve, accuracy_score

import keras
from keras.models import save_model, Sequential
from keras.layers import Activation, BatchNormalization, Dense
from keras.optimizers import Adam

import feature_engine.missing_data_imputers as mdi
from feature_engine.outlier_removers import Winsorizer

pd.options.display.max_columns= None
#pd.options.display.max_rows = None

### Data Exploration

In [None]:
# Load the PISA 2009 training file; low_memory=False has pandas parse the whole file
# at once rather than in chunks.
df = pd.read_csv("pisa2009train.csv",low_memory=False)

In [None]:
# Show the loaded frame.
df

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics; include='all' also covers non-numeric columns.
df.describe(include='all')

In [None]:
# (rows, columns)
df.shape

In [None]:
# Column labels, in file order.
df.columns

### Data Visualization

In [None]:
# Count plots for the first 14 columns on a 7x2 grid, one panel per column.
# Columns are passed by keyword (x= / y=) because positional data arguments are
# deprecated in seaborn.
count_cols = ['grade', 'male', 'raceeth', 'preschool', 'expectBachelors', 'motherHS',
              'motherBachelors', 'motherWork', 'fatherHS', 'fatherBachelors',
              'fatherWork', 'selfBornUS', 'motherBornUS', 'fatherBornUS']
# raceeth is drawn with horizontal bars (passed as y), as in the original layout.
horizontal_cols = {'raceeth'}

fig, axes = plt.subplots(7, 2, figsize=(20,40))
for ax, col in zip(axes.ravel(), count_cols):
    if col in horizontal_cols:
        sns.countplot(y=df[col], ax=ax)
    else:
        sns.countplot(x=df[col], ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Re-list the column labels (df has not been modified since the earlier df.columns cell).
df.columns

In [None]:
# Second 7x2 grid: count plots for the remaining 1/0 columns, distribution plots for
# the continuous columns, and mean readingScore per value of each continuous column.
# Each entry is (plot kind, column); the order fixes the panel position.
panels = [
    ('count', 'englishAtHome'),
    ('count', 'computerForSchoolwork'),
    ('count', 'read30MinsADay'),
    ('dist', 'minutesPerWeekEnglish'),
    ('dist', 'studentsInEnglish'),
    ('count', 'schoolHasLibrary'),
    ('count', 'publicSchool'),
    ('count', 'urban'),
    ('dist', 'schoolSize'),
    ('dist', 'readingScore'),
    ('bar', 'minutesPerWeekEnglish'),
    ('bar', 'studentsInEnglish'),
    ('bar', 'schoolSize'),
]

fig, axes = plt.subplots(7, 2, figsize=(20,40))
flat_axes = axes.ravel()
for ax, (kind, col) in zip(flat_axes, panels):
    if kind == 'count':
        sns.countplot(x=df[col], ax=ax)
    elif kind == 'dist':
        sns.distplot(df[col], ax=ax)
    else:
        # Bar height is the mean readingScore for each distinct value of `col`.
        sns.barplot(x=df[col], y=df.readingScore, ci=None, ax=ax)

# The grid has one more slot than there are panels; hide it instead of leaving an
# empty set of axes in the figure.
for ax in flat_axes[len(panels):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
df.corr()

In [None]:
plt.figure(figsize=(25,16))
sns.heatmap(df.corr(),cmap="coolwarm",annot=True,fmt='.2f',linewidths=2)
plt.show()

In [None]:
# Pair plot on a 500-row sample to keep rendering tractable; random_state pins the
# sample so the figure is the same on every run.
sns.pairplot(df.sample(500, random_state=0))
plt.show()

### Data Preprocessing

### Treat Missing Values

In [None]:
df.isnull().sum()

In [None]:
imputer1 = mdi.FrequentCategoryImputer(variables=['raceeth'])

In [None]:
imputer1.fit(df)

In [None]:
imputer1.variables

In [None]:
df = imputer1.transform(df)

In [None]:
df['raceeth'].value_counts()

In [None]:
df.columns

In [None]:
imputer2 = mdi.ArbitraryNumberImputer(arbitrary_number=0.0,variables=['preschool', 'expectBachelors', 'motherHS',
       'motherBachelors', 'motherWork', 'fatherHS', 'fatherBachelors','fatherWork', 'selfBornUS', 'motherBornUS', 
       'fatherBornUS','englishAtHome', 'computerForSchoolwork', 'read30MinsADay','schoolHasLibrary',
       'publicSchool', 'urban'])

In [None]:
imputer2.fit(df)

In [None]:
df = imputer2.transform(df)

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
imputer3 = mdi.MeanMedianImputer(imputation_method='median',variables=['minutesPerWeekEnglish', 'studentsInEnglish',
                                                                      'schoolSize'])

In [None]:
imputer3.fit(df)

In [None]:
df = imputer3.transform(df)

In [None]:
df.isnull().sum()

In [None]:
df

### Treat Duplicate Values

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
df.duplicated(keep='first').sum()

In [None]:
# Disabled: would list every row that belongs to a group of duplicates.
# df[df.duplicated(keep=False)]

In [None]:
# Disabled: would drop the repeated rows in place and renumber the index.
# df.drop_duplicates(ignore_index=True, inplace=True)

### Treat Outliers

In [None]:
df.describe()

In [None]:
windsorizer = Winsorizer(distribution='skewed',tail='both',fold=1.5,
                         variables=['minutesPerWeekEnglish', 'studentsInEnglish','schoolSize'])

In [None]:
windsorizer.fit(df)

In [None]:
df_t = windsorizer.transform(df)

In [None]:
df_t

In [None]:
df_t.describe()

In [None]:
windsorizer.left_tail_caps_

In [None]:
windsorizer.right_tail_caps_

### Treat Data Types

In [None]:
# dtype of every column after imputation.
df.dtypes

In [None]:
# Disabled: would write the imputed frame to disk.
#df.to_csv("pisatrain.csv",index=False)

### Feature Scaling

In [None]:
df

In [None]:
df.drop(['grade','raceeth'],axis=1,inplace=True)

In [None]:
df

In [None]:
df_num = df[['minutesPerWeekEnglish','studentsInEnglish','schoolSize']]

In [None]:
df_num

In [None]:
minmax = MinMaxScaler()

In [None]:
dfnumscaled = minmax.fit_transform(df_num)

In [None]:
dfnumscaled = pd.DataFrame(dfnumscaled,columns=df_num.columns)

In [None]:
dfnumscaled

In [None]:
df.drop(['minutesPerWeekEnglish','studentsInEnglish','schoolSize'],axis=1,inplace=True)

In [None]:
df

In [None]:
df2 = pd.concat([dfnumscaled,df],axis=1)

In [None]:
df2

### Create and save processed dataset

In [None]:
# Disabled: would write the processed frame (df2) to disk.
#df2.to_csv("pisatrain.csv",index=False)

### Model Training

In [None]:
# Select features and target by name from df2, the processed frame that also holds
# the scaled continuous columns. The previous positional slicing of df
# (iloc[:, 0:6] / iloc[:, 6]) used only the first six columns as features and took
# the target by position rather than the readingScore column the notebook sets out
# to predict.
TARGET = 'readingScore'
X = df2.drop(columns=[TARGET])
y = df2[TARGET]

In [None]:
# Raw arrays behind the feature frame and the target series.
X.values, y.values

In [None]:
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Shapes of the four splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
xgb = XGBRegressor(random_state=0, n_estimators=100, objective='')

In [None]:
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='',early_stopping_rounds=10)

In [None]:
# Predictions of the fitted model for the held-out test features.
y_pred = xgb.predict(X_test)

In [None]:
y_pred

### Model Evaluation

In [None]:
plot_confusion_matrix(xgb,X_test,y_test)
plt.show()

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
plot_roc_curve(xgb,X_test,y_test)
plt.show()

In [None]:
# Mean squared error between actual and predicted test targets.
mse = mean_squared_error(y_test,y_pred)
mse

In [None]:
# Root of the MSE, i.e. the error expressed in the target's own units.
rmse = np.sqrt(mse)
rmse

In [None]:
# Coefficient of determination on the test split.
r2score = r2_score(y_test,y_pred)
r2score

In [None]:
# Scatter of predicted vs actual values with a fitted regression line.
fig, ax = plt.subplots(figsize=(10,8))
sns.regplot(x=y_test, y=y_pred, ax=ax)
plt.title("Plot to compare actual vs predicted")
plt.ylabel("Predicted")
plt.xlabel("Actual")
plt.show()

In [None]:
# Feature importance chart of the fitted booster, drawn with xgboost's plot_importance.
fig, ax = plt.subplots(figsize=(20,10))
plot_importance(xgb,ax=ax)
plt.show()

### Cross-Validation

In [None]:
# 5-fold cross-validation with the estimator's default scorer. The per-fold scores are
# kept in `cv` because the next cell averages them; without the assignment that cell
# fails with a NameError.
cv = cross_val_score(xgb,X,y,cv=5,verbose=2)
cv

In [None]:
cv.mean()

### Feature Selection

### Model Prediction

In [None]:
# NOTE(review): the original call passed no path and raised a TypeError. The file name
# below is assumed by analogy with "pisa2009train.csv" — confirm it.
testdata_raw = pd.read_csv("pisa2009test.csv",low_memory=False)

# Push the test rows through the steps already fitted on the training data (the three
# imputers, then the min-max scaler) and keep exactly the columns the model was
# trained on, in the same order. Assumes the test file has the same columns as the
# training file — TODO confirm.
testdata_prepared = imputer3.transform(imputer2.transform(imputer1.transform(testdata_raw)))
scaled_cols = ['minutesPerWeekEnglish','studentsInEnglish','schoolSize']
testdata_prepared[scaled_cols] = minmax.transform(testdata_prepared[scaled_cols])
testdata = testdata_prepared[X.columns]

In [None]:
# Predictions of the fitted model for the rows in testdata.
answer = xgb.predict(testdata)

In [None]:
answer

### Model Tuning

In [None]:
# The notebook's stated task is predicting a numeric reading score, so the
# hyper-parameter search tunes a regressor rather than a classifier.
model = XGBRegressor(random_state=0)

In [None]:
# Search space for the randomized search; every key must be an actual XGBoost
# parameter name. The learning-rate key is spelled 'learning_rate' — with a space
# instead of the underscore it is not a known parameter and is never tuned.
parameters = {'max_depth': np.arange(3,10,1),
              'learning_rate': np.arange(0.05,0.3,0.03),
              'n_estimators':np.arange(100,1000,100),
              'min_child_weight': np.arange(1,4,1),
              'gamma':np.arange(0,50,2),
              'subsample':np.arange(0.5,0.9,0.1),
              'colsample_bytree':np.arange(0.5,0.9,0.1)
             }

In [None]:
# 50 randomly drawn parameter combinations, each scored with 5-fold CV on all cores.
# random_state pins which combinations are drawn so the search is repeatable.
randm = RandomizedSearchCV(estimator=model, param_distributions = parameters, cv = 5, n_iter = 50,
                           n_jobs=-1, random_state=0)

In [None]:
# Run the search on the training split only.
randm.fit(X_train, y_train)

In [None]:
# Estimator refitted with the best parameter combination found.
randm.best_estimator_

In [None]:
# Mean cross-validated score of that best combination.
randm.best_score_

In [None]:
# The winning parameter values.
randm.best_params_

### New Model

In [None]:
xgbnew = XGBClassifier(random_state=0, n_estimators=500, objective='binary:logistic',max_depth= 7,
                      gamma= 10, min_child_weight= 1)

In [None]:
xgbnew.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='error',early_stopping_rounds=10)

In [None]:
# Predictions of the retrained model for the held-out test features.
y_pred_new = xgbnew.predict(X_test)

In [None]:
y_pred_new