In [None]:
# Linear Regression: fit on the training split, evaluate on the held-out split.
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from IPython.display import display

reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
y_pred = reg_model.predict(X_test)   # predictions for the held-out split
x_pred = reg_model.predict(X_train)  # in-sample (fitted) values for the training split

# Observed vs. predicted targets on the test split, side by side.
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
# A bare expression in the middle of a cell is not rendered; show it explicitly.
display(reg_model_diff)

mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE: `r2` used to hold the RMSE; it now holds the actual coefficient of
# determination, and the RMSE lives in `rmse`.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', rmse)

# y_pred was produced from X_test, so it must be scored against y_test
# (the original compared it with y_train).
print("R squared: {}".format(r2))

# Assumption checks for the fitted linear model.
# Residuals are taken on the training split (observed - fitted), so they are
# paired with x_pred (predictions for X_train), not with the test predictions.
fitted = np.ravel(x_pred)
residuals = np.ravel(y_train.values) - fitted

# 1. Mean of residuals - mean of the residuals should be zero
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

# 2. Check for homoscedasticity
# NOTE(review): the axis limits below are hard-coded and presumably tuned to
# one specific dataset - adjust them to the target's range.
plt.figure()
p = sns.scatterplot(x=fitted, y=residuals)  # seaborn expects x/y as keywords
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
p = sns.lineplot(x=[0, 26], y=[0, 0], color='blue')  # zero-residual reference line
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

# 3. Check for normality of error terms/residuals
# A fresh figure keeps the histogram off the scatter plot's axes (and limits).
plt.figure()
# histplot replaces the deprecated distplot; stat='density' keeps the
# histogram on the same scale as the KDE curve.
p = sns.histplot(residuals, kde=True, stat='density')
p = plt.title('Normality of error terms/residuals')

# 4. No autocorrelation of residuals - the error terms should not form any pattern
plt.figure(figsize=(10, 5))
p = sns.lineplot(x=fitted, y=residuals, marker='o', color='blue')
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10, 10)
plt.xlim(0, 26)
p = sns.lineplot(x=[0, 26], y=[0, 0], color='red')
p = plt.title('Residuals vs fitted values plot for autocorrelation check')




In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

dec_tree = DecisionTreeRegressor(random_state=0)
dec_tree.fit(X_train, y_train)
dec_tree_y_pred = dec_tree.predict(X_train)  # in-sample predictions

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so label it as such and report the held-out split next to the
# training split.
print("R squared (train): {}".format(r2_score(y_true=y_train, y_pred=dec_tree_y_pred)))
print("R squared (test): {}".format(dec_tree.score(X_test, y_test)))

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision tree classifier: fit on the training split, score on the test split.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
# Predictions come from predict() on the held-out features; the original
# called fit() with a single argument here, which raises.
y_pred = tree_clf.predict(X_test)
# Print the score: a bare expression in the middle of a cell is not rendered.
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

from sklearn import tree

# Draw the fitted classifier tree. The original referenced an undefined name
# `dt`; `tree_clf` is the fitted DecisionTreeClassifier in this notebook.
plt.figure(figsize=(20, 10))
# feature_names is passed as a plain list, which every scikit-learn release accepts.
tree.plot_tree(tree_clf, feature_names=list(X_train.columns));
# plt.savefig('sampleTree.jpeg')


# Candidate hyper-parameter values keyed by estimator argument name.
# NOTE(review): presumably meant for a grid/randomised search over a
# DecisionTreeClassifier - no search is actually run with it here.
params = dict(
    criterion=("gini", "entropy"),
    splitter=("best", "random"),
    max_depth=[*range(1, 20)],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[*range(1, 20)],
)


import graphviz
from io import StringIO  # stdlib text buffer; replaces the Python-2 era `six` shim
from IPython.display import Image
import pydotplus
import pydot
from sklearn.tree import export_graphviz

# Render the fitted classifier tree through Graphviz: export DOT text into an
# in-memory buffer, parse it with pydotplus, and show the PNG inline.
# The original referenced an undefined name `dt`; `tree_clf` is the fitted
# DecisionTreeClassifier in this notebook.
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graphviz.graph_from_dot_data(dot_data.getvalue())

Image(graph.create_png())



In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

rf_tree = RandomForestRegressor(random_state=0)
rf_tree.fit(X_train, y_train)
rf_tree_y_pred = rf_tree.predict(X_train)  # in-sample predictions

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so label it as such and report the held-out split next to the
# training split.
print("R squared (train): {}".format(r2_score(y_true=y_train, y_pred=rf_tree_y_pred)))
print("R squared (test): {}".format(rf_tree.score(X_test, y_test)))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Baseline random-forest classifier with 1000 trees and a fixed seed, trained
# on the training split. fit() returns the estimator itself, so the chained
# call leaves `rf_clf` bound to the fitted model.
rf_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
).fit(X_train, y_train)

# Candidate values for a random-forest hyper-parameter search.
# NOTE(review): RandomizedSearchCV is imported in this cell but no search is
# run in the visible source - presumably this grid is meant for it.
n_estimators = [500, 900, 1100, 1500]
# 'auto' was dropped: it is no longer accepted by current scikit-learn and,
# for classifiers, it was only an alias of 'sqrt'.
max_features = ['sqrt', 'log2']
max_depth = [2, 3, 5, 10, 15, None]  # None lets the trees grow unrestricted
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

params_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}  # the closing brace was missing in the original (SyntaxError)

In [None]:
# Support Vector Regressor
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train, y_train)
svr_y_pred = svr.predict(X_train)  # in-sample predictions

# For a regressor, .score() is the coefficient of determination (R^2), not an
# accuracy, so label it as such and report the held-out split next to the
# training split.
print("R squared (train): {}".format(r2_score(y_true=y_train, y_pred=svr_y_pred)))
print("R squared (test): {}".format(svr.score(X_test, y_test)))

from sklearn.svm import SVC

# RBF-kernel support-vector classifier with explicit gamma and C, trained on
# the training split. fit() returns the estimator itself, so the chained call
# leaves `svm_clf` bound to the fitted model.
svm_clf = SVC(C=1.0, gamma=0.1, kernel='rbf').fit(X_train, y_train)

# Candidate hyper-parameter values keyed by estimator argument name.
# NOTE(review): presumably meant for a grid search over SVC - no search is
# actually run with it in this cell.
params = dict(
    C=(0.1, 0.5, 1, 2, 5, 10, 20),
    gamma=(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
    kernel=('linear', 'poly', 'rbf'),
)

In [None]:
# Classification: logistic regression and its precision-recall curve
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

from sklearn.metrics import precision_recall_curve, auc

# `y_predlr` was never defined in this cell. A precision-recall curve needs a
# continuous score per sample, so use the predicted probability of the
# positive class on the held-out split.
# NOTE(review): assumes a binary target whose positive class is the second
# entry of lr_clf.classes_ - confirm against the dataset.
y_predlr = lr_clf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_predlr)
auprc = auc(recall, precision)  # area under the precision-recall curve

# Plotting AUPRC
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

print('AUPRC = ', auprc)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Baseline k-NN classifier with default settings.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Hyperparameter tuning: sweep n_neighbors and record accuracy on both splits
# so the two curves can be compared.
train_score = []
test_score = []
neighbors = range(1, 30)

for k in neighbors:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    train_score.append(accuracy_score(y_train, model.predict(X_train)))
    test_score.append(accuracy_score(y_test, model.predict(X_test)))

# The plotting code runs once after the sweep. The original had it at a
# 2-space indent that matches no enclosing block, which is an IndentationError.
plt.figure(figsize=(10, 7))

plt.plot(neighbors, train_score, label="Train score")
plt.plot(neighbors, test_score, label="Test score")
plt.xticks(list(neighbors))  # one tick per evaluated k (previously stopped at 20)
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend();

In [None]:
# Regression - Ridge and Lasso

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# Ridge: the alpha penalty keeps the coefficients from becoming too big.
ridge = Ridge(alpha=.3)
ridge.fit(X_train, y_train)
# coef_ is 1-D for a 1-D target and (1, n_features) for a single-column 2-D
# target; flattening handles both, where coef_[0][i] only worked for 2-D.
ridge_coefs = np.ravel(ridge.coef_)
for i, col in enumerate(X_train.columns):
    print("Ridge model coefficients for {} is {}:".format(col, ridge_coefs[i]))

# .score() on a regressor is R^2; report both splits.
print("Ridge R squared (train):", ridge.score(X_train, y_train))
print("Ridge R squared (test):", ridge.score(X_test, y_test))


# Lasso: same procedure with an L1-penalised model.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
lasso_coefs = np.ravel(lasso.coef_)
for i, col in enumerate(X_train.columns):
    print("Lasso model coefficients for {} is {}:".format(col, lasso_coefs[i]))

print("Lasso R squared (train):", lasso.score(X_train, y_train))
print("Lasso R squared (test):", lasso.score(X_test, y_test))


from sklearn.linear_model import LogisticRegression
from IPython.display import display

# Logistic regression with default settings.
# NOTE(review): x_train2 / x_test2 / x_test / sc are defined outside this
# cell; presumably x_train2 / x_test2 are the scaled feature matrices and
# `sc` is the fitted scaler - confirm.
lr = LogisticRegression()
lr.fit(x_train2, y_train)
y_pred = lr.predict(x_test2)
y_predr1 = lr.predict(x_train2)

# Transform the test features with the fitted scaler. Only the column labels
# are used below, to name the coefficients.
x_test_transformed = pd.DataFrame(sc.transform(x_test), index=x_test.index, columns=x_test.columns)

# One row per feature: column name next to its fitted coefficient.
LogisticCoeff = pd.concat([pd.DataFrame(x_test_transformed.columns), pd.DataFrame(np.transpose(lr.coef_))], axis=1)
LogisticCoeff.columns = ['Variable', 'Coefficient']
# A bare expression in the middle of a cell is not rendered; show it explicitly.
display(LogisticCoeff)


# Same model with an L1 penalty. `lr` is rebound to this second estimator.
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(x_train2, y_train)
y_predL1 = lr.predict(x_test2)
y_predli = lr.predict(x_train2)

# x_test_transformed is reused as is; the original recomputed the identical
# transform a second time.
LassoCoeff = pd.concat([pd.DataFrame(x_test_transformed.columns), pd.DataFrame(np.transpose(lr.coef_))], axis=1)

LassoCoeff.columns = ['Variable', 'Coefficient']
LassoCoeff


In [None]:
# Clustering: elbow curve for choosing the number of k-means clusters
from sklearn.cluster import KMeans

ks = range(1, 6)
inertias = []  # inertia (within-cluster sum of squares, WCSS) for each k

for k in ks:
    # A fixed random_state makes the centroid initialisation, and therefore
    # the curve, reproducible across kernel restarts.
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(samples)
    inertias.append(model.inertia_)

plt.plot(ks, inertias, '-o')
plt.xlabel('No. of Clusters, k');
plt.ylabel('Inertia');
plt.xticks(ks);
plt.title('Elbow Curve');

# Fit k-means with three clusters and cross-tabulate the assigned cluster
# labels against the `varieties` values.
# A fixed random_state keeps the label numbering, and so the table layout,
# stable between runs.
model = KMeans(n_clusters=3, random_state=42)
labels = model.fit_predict(samples)

df = pd.DataFrame({'labels': labels, 'varieties': varieties})
ct = pd.crosstab(df['labels'], df['varieties'])
ct


import scipy.cluster.hierarchy as shc

# Hierarchical clustering of `data` with Ward linkage, drawn as a dendrogram
# on a 10x7 figure.
plt.figure(figsize=(10, 7))
plt.title("Customer Dendograms")
ward_links = shc.linkage(data, method='ward')
dend = shc.dendrogram(ward_links)

from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into five groups with Ward linkage.
# The `affinity` argument is omitted: Ward linkage only supports Euclidean
# distance, which is the default, and current scikit-learn no longer accepts
# `affinity`.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')
labels_ = cluster.fit_predict(data)

# Scatter the first two columns, coloured by assigned cluster.
# np.asarray makes the positional slicing work for a DataFrame as well as an array.
points = np.asarray(data)
plt.figure(figsize=(10, 7))
plt.scatter(points[:, 0], points[:, 1], c=labels_, cmap='rainbow')