In [None]:
import time
import warnings
import random
import pandas as pd
import datetime
import lightgbm as lgb
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
import xgboost as xgb
from xgboost import plot_importance
from scipy.stats import spearmanr
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def evaluate(alg, X_test, y_test):
    """Print MAE, MSE, RMSE and R2 for ``alg``'s predictions on the given data.

    Parameters
    ----------
    alg : fitted estimator exposing ``predict``.
    X_test : feature matrix handed to ``alg.predict``.
    y_test : true target values the predictions are scored against.

    Returns
    -------
    None. The metrics are printed, always with a ``Test`` label, whichever
    split the caller passes in.
    """
    y_test_pred = alg.predict(X_test)
    mse = metrics.mean_squared_error(y_test, y_test_pred)
    print('Test MAE: {:0.4f}'.format(metrics.mean_absolute_error(y_test, y_test_pred)))
    print('Test MSE: {:0.4f}'.format(mse))
    # RMSE is taken as the square root of the MSE instead of passing
    # ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    # later removed, so relying on it breaks on current releases.
    print('Test RMSE: {:0.4f}'.format(np.sqrt(mse)))
    print('Test R2: {:0.4f}'.format(metrics.r2_score(y_test, y_test_pred)))

In [None]:
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    """Fit an XGBoost sklearn-style model, optionally sizing it with ``xgb.cv``.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the model's own
    booster parameters, RMSE as the metric and early stopping. The best mean
    test RMSE and its position are printed, and the number of rows in the CV
    result becomes the model's ``n_estimators``. The model is then fitted on
    the full training data and scored on that same data.

    Parameters
    ----------
    alg : XGBoost estimator providing ``get_xgb_params``, ``get_params``,
        ``set_params`` and ``fit``.
    X_train, y_train : training features and target.
    useTrainCV : whether to run the cross-validated round selection first.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None. ``alg`` is modified in place (parameters set, model fitted).
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )
        test_rmse = cvresult['test-rmse-mean']
        print(test_rmse.min())
        print(test_rmse.argmin())
        # One row per boosting round returned by xgb.cv; use that count as
        # the size of the final model.
        alg.set_params(n_estimators=cvresult.shape[0])

    # The metric is configured on the estimator rather than passed to fit():
    # the ``eval_metric`` argument of fit() was deprecated and then removed
    # in XGBoost 2.0, where passing it raises a TypeError.
    alg.set_params(eval_metric='rmse')
    alg.fit(X_train, y_train)

    # Report fit quality on the training data. Note that evaluate() prints
    # its lines with a "Test" label even though these are training metrics.
    evaluate(alg, X_train, y_train)

In [None]:
# Load the course feature table; the following cells slice it by column position.
df = pd.read_csv("udemy/course_all_features.csv")
# Display (rows, columns) as the cell output.
df.shape

In [None]:
# Columns 19..3858 (by position) are treated as the text features.
raw_text_columns = df.iloc[:, 19:3859]
# Standardize every text column before the PCA.
# NOTE(review): the scaler is fitted on all rows of `df`, i.e. before any
# train/test split — confirm that this is acceptable for the evaluation.
text_scaler = StandardScaler()
text_features = text_scaler.fit_transform(raw_text_columns)
# A float n_components asks PCA to keep as many components as are needed
# to explain that share (95%) of the variance.
pca = PCA(n_components=0.95)

In [None]:
# Fit the PCA on the standardized text features and project them onto the retained components.
new_text_features = pca.fit_transform(text_features)

In [None]:
# Wrap the PCA output in a DataFrame (default integer row and column labels)
# so it can be concatenated with the remaining columns.
new_text_df = pd.DataFrame(new_text_features)
# Display (rows, retained components) as the cell output.
new_text_df.shape

In [None]:
# Non-text columns of the loaded table, selected by position (columns 0..17).
# They are taken from `df`, the frame the text features were sliced from, so
# the rows line up with `new_text_df`; the previous `df2` is not defined
# anywhere in this notebook and raised a NameError on a fresh kernel.
# NOTE(review): the text slice starts at column 19, so column 18 ends up in
# neither part — confirm that dropping it is intended.
rest_features = df.iloc[:, 0:18]
# Side-by-side concatenation: original non-text columns + PCA components.
new_df = pd.concat([rest_features, new_text_df], axis=1)
new_df.shape

In [None]:
# Regression targets on a log2 scale.
# NOTE(review): log2 of a zero count is -inf — confirm both columns are
# strictly positive in the data.
reviews = np.log2(new_df['num_monthly_reviews'])
enrollments = np.log2(new_df['num_monthly_enrollments'])

# Predictors: everything except the course identifier and the two targets.
# The enrollments column name is spelled as in the line above (the previous
# 'num_monthly_enrollemnts' raised a KeyError), and `columns=` replaces the
# positional axis argument that pandas 2.0 no longer accepts.
features = new_df.drop(columns=['courseID', 'num_monthly_reviews', 'num_monthly_enrollments'])
# One-hot encode the categorical columns.
# NOTE(review): "instructionsl_level" is kept exactly as written — confirm it
# matches the column header in the CSV.
features = pd.get_dummies(data=features, columns=["category","instructionsl_level","published_year","published_month"])
features.shape

In [None]:
# Hold out 20% of the rows for testing; the target is the log2 monthly
# review count. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    reviews,
    test_size=0.2,
    random_state=121212,
)
(X_train.shape, X_test.shape)

In [None]:
# Baseline: ordinary least squares fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
train_mse = metrics.mean_squared_error(y_train, y_pred)
print('Training MAE: {:0.4f}'.format(metrics.mean_absolute_error(y_train, y_pred)))
# The previous "Training MSE" line actually printed the RMSE (it passed
# squared=False, a keyword since removed from scikit-learn). Both values are
# now reported under their correct labels.
print('Training MSE: {:0.4f}'.format(train_mse))
print('Training RMSE: {:0.4f}'.format(np.sqrt(train_mse)))
# Held-out performance.
evaluate(lr, X_test, y_test)

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=600, num=3)]
# Number of features to consider at every split.
# Integer division is required here: scikit-learn reads an int as a feature
# count, whereas the float produced by `/` is read as a fraction of the
# features and is rejected when it exceeds 1.0.
n_test_features = max(1, features.shape[1] // 2)
# 1.0 means "all features". It replaces 'auto', which had the same meaning
# for regressors and has been removed from scikit-learn.
max_features = [1.0, 'sqrt', n_test_features]
# Maximum number of levels in tree (None = unlimited depth)
max_depth = [int(x) for x in np.linspace(5, 50, num=5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters.
# Random search of parameters, using 5 fold cross validation,
# search across 100 different combinations, and use all available cores.
# The base estimator is created here: previously `rf` was only defined in a
# later cell, so this cell raised a NameError on a fresh Restart & Run All.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the randomized search for later evaluation.
best_random = rf_random.best_estimator_
# Display the winning parameter combination alongside the estimator itself.
rf_random.best_params_, rf_random.best_estimator_

In [None]:
# Held-out metrics for the best random-search model.
# This cell used to repeat the body of evaluate() line for line and also
# computed a training-set prediction that was never used; the shared helper
# prints the same four "Test ..." lines.
evaluate(best_random, X_test, y_test)

In [None]:
# Exhaustive search over a narrow random-forest grid.
# NOTE(review): the max_features values are absolute feature counts and
# presumably depend on how many columns `features` has — confirm after any
# change to the PCA or the one-hot encoding.
param_grid = dict(
    bootstrap=[True],
    max_depth=[15, 20],
    max_features=[600, 700, 800],
    min_samples_leaf=[1, 2, 3],
    min_samples_split=[2, 3, 4],
    n_estimators=[400, 500, 600],
)
# Base model with default settings; the grid overrides the listed parameters.
rf = RandomForestRegressor()
# 5-fold cross-validated grid search on all available cores.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
(grid_search.best_params_, grid_search.best_score_)

In [None]:
# Final random forest for the reviews target, with fixed hyperparameters.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    max_features=700,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    n_jobs=-1,
)
rf.fit(X_train, y_train)
# Held-out performance.
evaluate(rf, X_test, y_test)

In [None]:
# XGBoost tuning, step 1: coarse sweep over tree depth and min_child_weight.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# Base regressor; the grid overrides max_depth and min_child_weight.
base_xgb = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=444,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=42,
)
# 10-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_test1,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=10,
)
gsearch1.fit(X_train, y_train)
(gsearch1.best_params_, gsearch1.best_score_)

In [None]:
# XGBoost tuning, step 2: finer sweep over tree depth and min_child_weight.
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
# This search gets its own name and is actually fitted. Previously it
# overwrote `gsearch1` with an unfitted object (the fit call was commented
# out), so reading best_params_ below raised an AttributeError and the
# step-1 results were lost.
gsearch2 = GridSearchCV(estimator = xgb.XGBRegressor(learning_rate =0.1, n_estimators=444, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,nthread=4, scale_pos_weight=1, seed=42), 
 param_grid = param_test2,scoring='neg_root_mean_squared_error',n_jobs=4,cv=10)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# XGBoost tuning, step 2b: try larger min_child_weight values at max_depth=6.
param_test2b = {'min_child_weight': [6, 8, 10]}
# Base regressor; the grid overrides min_child_weight.
base_xgb = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=444,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=42,
)
# 10-fold CV scored by negated RMSE (higher is better).
gsearch2b = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_test2b,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=10,
)
gsearch2b.fit(X_train, y_train)
(gsearch2b.best_params_, gsearch2b.best_score_)

In [None]:
# XGBoost tuning, step 3: sweep gamma over 0.0, 0.1, ..., 0.4.
param_test3 = {'gamma': [step / 10.0 for step in range(0, 5)]}
# Base regressor with depth and child weight fixed; the grid overrides gamma.
base_xgb = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=444,
    max_depth=6,
    min_child_weight=6,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=42,
)
# 10-fold CV scored by negated RMSE (higher is better).
gsearch3 = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_test3,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=10,
)
gsearch3.fit(X_train, y_train)
(gsearch3.best_params_, gsearch3.best_score_)

In [None]:
# XGBoost tuning, step 4: sweep row and column subsampling over 0.4 ... 1.0.
param_test4 = {
    'subsample': [step / 10.0 for step in range(4, 11)],
    'colsample_bytree': [step / 10.0 for step in range(4, 11)],
}
# Base regressor (260 trees from here on); the grid overrides both
# subsampling ratios.
base_xgb = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=260,
    max_depth=6,
    min_child_weight=6,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=42,
)
# 10-fold CV scored by negated RMSE (higher is better).
gsearch4 = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_test4,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=10,
)
gsearch4.fit(X_train, y_train)
(gsearch4.best_params_, gsearch4.best_score_)

In [None]:
# XGBoost tuning, step 6: sweep the L1 (reg_alpha) and L2 (reg_lambda)
# regularization strengths.
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = xgb.XGBRegressor(learning_rate =0.1, n_estimators=260, max_depth=6,
 min_child_weight=6, gamma=0, subsample=0.85, colsample_bytree=0.7,nthread=4, scale_pos_weight=1, seed=42), 
 param_grid = param_test6,scoring='neg_root_mean_squared_error',n_jobs=4,cv=10)
gsearch6.fit(X_train,y_train)
# Report the score of this search; the previous `gsearch5` is not defined
# anywhere in the notebook and raised a NameError.
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Final XGBoost model: low learning rate with a large round budget.
# modelfit() replaces n_estimators with the round count found by xgb.cv.
xgb_best_params = dict(
    gamma=0,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1,
    n_estimators=5000,
    colsample_bytree=0.6,
    subsample=1,
    seed=42,
)
xgb_best = xgb.XGBRegressor(**xgb_best_params)
# Cross-validated sizing + fit on the training split (prints training metrics).
modelfit(xgb_best, X_train, y_train)
# Held-out performance.
evaluate(xgb_best, X_test, y_test)

In [None]:
# Standardize the predictors for the neural network. The scaler learns its
# statistics from the training split only and is then applied to both splits.
sc_X = StandardScaler().fit(X_train)
X_trainscaled = sc_X.transform(X_train)
X_testscaled = sc_X.transform(X_test)

In [None]:
# First MLP attempt: three hidden layers, with early stopping after 10
# iterations without improvement.
mlp = MLPRegressor(
    hidden_layer_sizes=(64, 32, 16),
    activation="relu",
    random_state=42,
    max_iter=2000,
    early_stopping=True,
    n_iter_no_change=10,
)
mlp.fit(X_trainscaled, y_train)
# Held-out performance on the scaled test split.
evaluate(mlp, X_testscaled, y_test)

In [None]:
# Search space for the MLP: 4 architectures x 3 activations x 1 solver
# x 5 alpha values x 2 learning-rate schedules = 120 combinations.
param_grid = dict(
    hidden_layer_sizes=[(64, 32, 16), (512, 256, 128), (2048, 1024, 512), (512, 256, 128, 64)],
    activation=['relu', 'tanh', 'identity'],
    solver=['adam'],
    alpha=[0.0001, 0.001, 0.01, 0.1, 1],
    learning_rate=['constant', 'adaptive'],
)

In [None]:
# Base MLP for the grid search; every parameter listed in param_grid is
# overridden per candidate.
mlp = MLPRegressor(
    hidden_layer_sizes=(512, 256, 128, 64),
    activation="relu",
    alpha=0.0001,
    solver='adam',
    random_state=42,
    max_iter=2000,
    early_stopping=True,
    n_iter_no_change=10,
)
# 5-fold cross-validated grid search on all available cores.
grid = GridSearchCV(estimator=mlp, param_grid=param_grid, n_jobs=-1, cv=5)
grid.fit(X_trainscaled, y_train)
print(grid.best_params_)

In [None]:
# Keep the best MLP found by the grid search.
best_mlp = grid.best_estimator_
# Held-out performance on the scaled test split.
evaluate(best_mlp,X_testscaled,y_test)

In [None]:
# Final MLP for the reviews target, with fixed hyperparameters.
mlp = MLPRegressor(hidden_layer_sizes=(2048,1024,512),activation="relu" ,random_state=42, max_iter=2000,early_stopping=True,learning_rate='constant',n_iter_no_change=10)
# Fit and evaluate the model that was just built. The previous code called
# `mlp1f`, which is not defined anywhere in the notebook (NameError).
mlp.fit(X_trainscaled, y_train)
evaluate(mlp,X_testscaled,y_test)