In [None]:
# SKLEARN TUTORIAL
# ALGORITHMS TO LEARN:
# GENERALIZED LINEAR MODELS
    # LINEAR REGRESSION
    # LOGISTIC REGRESSION
# CLUSTERING METHODS:
    # K-MEANS CLUSTERING
# DIMENSIONALITY REDUCTION:
    # MULTIDIMENSIONAL SCALING (MDS)

In [None]:
# LINEAR REGRESSION TUTORIAL

# LOAD DEPENDENCIES
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# LOAD DATASET
# Open the workbook in a context manager so the underlying file handle is
# released as soon as the sheet has been parsed; a bare pd.ExcelFile(...)
# keeps the file open for the lifetime of the kernel.
with pd.ExcelFile(r"C:\Users\Dell\Desktop\Training Classes\Python Bootcamp\Sales_train_test.xlsx") as xl:
    df = xl.parse("Sheet1")
# (The former bare `df` statement was dropped: it was not the last expression
# of the cell, so it never displayed anything.)

# INITIATE LINEAR REGRESSION
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

# SPLIT DATA INTO TEST AND TRAIN
# Rows are routed to each subset by the value stored in the 'set' column.
train = df.loc[df['set'] == 'Train']
test = df.loc[df['set'] == 'Test']

# SELECT PREDICTORS
# Keep every column except the target ('sales_a') and the 'period.1' / 'set'
# columns.
independent_variables = train.drop(columns=['sales_a', 'period.1', 'set'])

# PERFORM LINEAR REGRESSION (TRAIN)
lm.fit(independent_variables, train['sales_a'])

# MODEL COMPONENTS
# A notebook only echoes the LAST expression of a cell, so each result below
# is printed explicitly instead of being computed and silently discarded.
# INTERCEPT
print("Intercept:", lm.intercept_)
# COEFFICIENTS
# Pair each predictor name with its fitted coefficient.
print(pd.DataFrame(list(zip(independent_variables.columns, lm.coef_)), columns = ['features', 'coefficients']))

# MODEL FIT
print("Train R-squared:", lm.score(independent_variables, train.sales_a)) # R-SQUARED

# PERFORM PREDICTIONS
print("Train predictions:", lm.predict(independent_variables)) # BACKTEST ON TRAINING DATA

# PREPARE TESTING SET
# Same column selection as the training predictors, applied to the test rows.
independent_variables_2 = test.drop(['sales_a', 'period.1', 'set'], axis = 1)

# RUN TEST PREDICTIONS
print("Test R-squared:", lm.score(independent_variables_2, test.sales_a)) # R-SQUARED
print("Test predictions:", lm.predict(independent_variables_2))

# GRAPH USING MATPLOTLIB
# Every chart is drawn on its own figure. Without plt.figure() all scatter
# calls land on the same axes, mixing a row-index x-axis with a
# predicted-sales x-axis.

# VISUALIZE PREDICTIONS
# lm.predict() returns a plain array, so the frame built from it would get a
# fresh 0..n-1 index. `train` keeps the row labels it had in `df`, so joining
# on the index would drop or mispair rows unless the predictions carry the
# same labels.
prediction = pd.DataFrame(lm.predict(independent_variables), index=train.index)
prediction.columns = ['pred_y']
output = train.join(prediction, how='inner')
plt.figure()
plt.scatter(output.index, output.sales_a, label="Actual")
plt.scatter(output.index, output.pred_y, label="Predicted")
plt.legend()

# TRAINING SET
plt.figure()
plt.scatter(lm.predict(independent_variables), train.sales_a)
plt.xlabel("Predicted Sales")
plt.ylabel("Actual Sales")
plt.title("Predicted vs. Actual Sales")

# TESTING SET
plt.figure()
plt.scatter(lm.predict(independent_variables_2), test.sales_a)
plt.xlabel("Predicted Sales")
plt.ylabel("Actual Sales")
plt.title("Predicted vs. Actual Sales")

# METRICS
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# y_true / y_pred were referenced without ever being defined, which raised a
# NameError. Evaluate on the held-out test rows: the observed target versus
# the model's predictions for the same rows.
y_true = test.sales_a
y_pred = lm.predict(independent_variables_2)

print("Mean squared error:", mean_squared_error(y_true, y_pred))
print("Mean absolute error:", mean_absolute_error(y_true, y_pred))
print("R-squared:", r2_score(y_true, y_pred))

In [None]:
# LOGISTIC REGRESSION TUTORIAL

# INITIATE LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

# LOAD DATASET
# The context manager closes the workbook once the sheet is parsed, instead
# of leaving the file handle open for the rest of the session.
with pd.ExcelFile(r"C:\Users\Dell\Desktop\Training Classes\Python Bootcamp\usage_data.xlsx") as xl:
    df = xl.parse("Sheet1")
# (The former bare `df` statement was dropped: mid-cell expressions are never
# displayed.)

# TEST / TRAIN SPLIT
# Rows are routed to each subset by the value stored in the 'set' column.
train = df.loc[df['set'] == 'Train']
test = df.loc[df['set'] == 'Test']

# INITIATE INDEPENDENT VARIABLES
# Keep every column except 'record', the target 'use', and the 'set' column.
independent_variables = train.drop(columns=['record', 'use', 'set'])

# IMPLEMENT LOGISTIC REGRESSION
lg.fit(independent_variables, train['use'])

# MODEL COMPONENTS
# A notebook only echoes the LAST expression of a cell, so each result below
# is printed explicitly instead of being computed and silently discarded.
print("Intercept:", lg.intercept_) # INTERCEPT
print("Coefficients:", lg.coef_) # COEFFICIENTS

# ACCURACY
print("Train accuracy:", lg.score(independent_variables, train.use)) # MEAN ACCURACY

# PERFORM PREDICTIONS
print("Train predictions:", lg.predict(independent_variables)) # BACKTEST ON TRAINING DATA
print("Train probabilities:", lg.predict_proba(independent_variables)) # OUTPUT CASE PROBABILITIES

# PREPARE TESTING SET
# Same column selection as the training predictors, applied to the test rows.
independent_variables_2 = test.drop(['record', 'use','set'], axis = 1)
print("Test predictions:", lg.predict(independent_variables_2)) # PREDICT TESTING CASES
print("Test probabilities:", lg.predict_proba(independent_variables_2)) # OUTPUT CASE PROBABILITIES
print("Test accuracy:", lg.score(independent_variables_2, test.use)) # MEAN ACCURACY

In [None]:
# K-MEANS TUTORIAL

# INITIATE KMEANS
from sklearn.cluster import KMeans
# random_state pins the centroid initialisation so cluster numbering and
# membership are reproducible across runs. n_init is set explicitly because
# its default differs between scikit-learn releases.
km = KMeans(n_clusters=3, n_init=10, random_state=42)

# LOAD DATASET
# Both sheets are read inside one context manager so the workbook is closed
# as soon as parsing finishes.
with pd.ExcelFile(r"C:\Users\Dell\Desktop\Training Classes\Python Bootcamp\clustering_data.xlsx") as xl:
    df = xl.parse("Sheet1")
    df1 = xl.parse("Sheet2")

# TRAIN / TEST SET
# 'record_id' is removed so that it is not used as a clustering feature.
train_set = df.drop(['record_id'], axis = 1)
test_set = df1.drop(['record_id'], axis = 1)

# PERFORM KMEANS
km.fit(train_set)

# CLUSTER LABELS / CENTERS
# Printed explicitly: mid-cell expressions are never displayed by the notebook.
print("Training labels:", km.labels_)
print("Cluster centers:", km.cluster_centers_)

# PREDICT CLUSTER
# CREATE OUTPUT
# Predict once and reuse the result (previously predict() ran twice and the
# first result was thrown away).
output = km.predict(test_set)
# Carry df1's index on the labels so the index join pairs every row with its
# own prediction regardless of how df1 is indexed.
output1 = pd.DataFrame(output, columns = ['cluster'], index = df1.index)
output2 = df1.join(output1, how='inner')
output2.to_csv('clustering_output1.csv')

# PLOT CLUSTERS
ds0 = output2[output2.cluster==0]
ds1 = output2[output2.cluster==1]
ds2 = output2[output2.cluster==2]
# Centroid columns follow the order of the features the model was fitted on,
# so take the names from train_set rather than hard-coding them.
ds3 = pd.DataFrame(km.cluster_centers_, columns = train_set.columns)
plt.scatter(ds0.x_axis, ds0.y_axis)
plt.scatter(ds1.x_axis, ds1.y_axis)
plt.scatter(ds2.x_axis, ds2.y_axis)
# The centroids were computed but never drawn; overlay them as black crosses.
plt.scatter(ds3.x_axis, ds3.y_axis, marker='x', c='black')

# MEASURE FIT

from sklearn.metrics import silhouette_score
# Score the existing fit in the original feature space. The previous call
# passed km.fit_transform(train_set), which (a) re-fitted the model, replacing
# the clustering used for the predictions above, and (b) returned each row's
# distances to the cluster centres, so the silhouette was measured in that
# derived space instead of on the data that was clustered.
silhouette_score(train_set, km.labels_)

In [None]:
# MDS TUTORIAL

# INITIATE MDS
from sklearn.manifold import MDS
# With n_init = 1 the embedding depends entirely on one random start, so
# random_state is fixed to make the layout reproducible.
md = MDS(n_components = 2, n_init = 1, dissimilarity = 'precomputed', random_state = 42)

# LOAD DATASET
# The context manager closes the workbook once the sheet is parsed.
with pd.ExcelFile(r"C:\Users\Dell\Desktop\Training Classes\Python Bootcamp\correlation_matrix.xlsx") as xl:
    df = xl.parse("Sheet1")

# TRANSFORM MATRIX
# NOTE(review): dissimilarity='precomputed' expects a square matrix of pairwise
# dissimilarities. The file name suggests this sheet holds correlations
# (similarities); if so it should be converted first (e.g. 1 - df) - confirm
# against the workbook contents.
df1 = md.fit_transform(df)

# VISUALIZE MATRIX
# Two embedding coordinates per row of the input matrix.
df2 = pd.DataFrame(df1, columns=['x_axis', 'y_axis'])
# Point labels are taken from the input frame's index.
df3 = pd.DataFrame(df.index, columns=['label'])
df4 = df2.join(df3, how='inner')
plt.scatter(df4['x_axis'], df4['y_axis'])
# Write each label next to its point.
for txt, x_pos, y_pos in zip(df4['label'], df4['x_axis'], df4['y_axis']):
    plt.annotate(txt, (x_pos, y_pos))