# Machine Learning Models - LRA,LDA,MLP

### Prepare the Dataset

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

In [2]:
df = pd.read_csv('ML_dataset.csv')
df.head()

Unnamed: 0,industry,wr1,wr2,wr3,wr4,vol1,vol2,vol3,vol4,pe_ratio,...,profit_margin,free_cash_margin,volatility,cpi,interest_rate,unemployment_rate,sma,rsi,ema,label
0,Consumer Discretionary,-0.019375,0.042774,-0.009555,-0.007692,6230100.0,6996900.0,6377700.0,7026700.0,30.01,...,14.4,13.74,0.025251,241.428,0.4,5.0,56.24,8.149406,56.3,0
1,Utilities,0.025644,0.011924,-0.006826,0.03147,175800.0,195300.0,233900.0,221400.0,17.58,...,11.15,1.49,0.013118,238.132,0.36,5.0,73.803333,87.79661,73.714583,0
2,Real Estate,0.057182,-0.004482,-0.040843,0.036994,1298900.0,3493600.0,571100.0,832500.0,62.13,...,12.97,29.98,0.019104,237.838,0.13,5.0,23.196667,88.26087,23.240833,0
3,Industrials,-0.044857,0.062964,0.055392,-0.058078,1071300.0,880100.0,1080400.0,994800.0,6.68,...,27.43,-18.14,0.02365,305.691,5.08,3.5,235.736667,11.037986,235.240833,0
4,Consumer Staples,0.022806,-0.021995,0.021842,-0.028,6563300.0,6745100.0,9474700.0,10172500.0,10.28,...,4.55,10.21,0.019597,259.101,0.1,10.2,63.633333,72.44898,63.837083,1


In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Separate the y variable, the labels
y = df["label"]

# Separate the X variable, the features
X = df.drop(columns="label")

# One-hot encode the 'industry' column
X = pd.get_dummies(X, columns=['industry'])

X.head()

# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [5]:
X_test_scaled

array([[-1.43861084,  0.67699166, -1.54774223, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [-0.04102101, -3.88491824, -0.29083965, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [ 2.43334306, -0.64663245,  0.08355601, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       ...,
       [ 0.81094694,  1.61955143, -1.10620282, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [ 0.32292332,  1.07607365,  0.80893086, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [-1.13732523, -0.50039014,  0.53990795, ..., -0.46708284,
        -0.1649277 , -0.32487997]])

In [6]:
X_train_scaled

array([[ 0.83361291,  2.12312145, -0.41307886, ...,  2.14094782,
        -0.1649277 , -0.32487997],
       [ 0.8678758 ,  0.62914211, -1.65863345, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [ 0.15542089,  0.04777525,  0.10395843, ..., -0.46708284,
        -0.1649277 ,  3.07805987],
       ...,
       [-0.27804892,  0.22364082, -0.23826848, ..., -0.46708284,
        -0.1649277 , -0.32487997],
       [ 0.38400483, -0.44375139, -1.17226013, ...,  2.14094782,
        -0.1649277 , -0.32487997],
       [-0.15575346,  0.21847308,  1.36326248, ..., -0.46708284,
        -0.1649277 , -0.32487997]])

## Fit Models
Now we will test a range of models. In each we will fit the model in the train data, make predictons for the test data and obtain the accuracy. In later steps we will compare the accuracy of all the models. We will use primarily the library ```sklearn```

### Logistics Regression

In [7]:
model = LogisticRegression()

In [8]:
model.fit(X_train_scaled, y_train)

In [9]:
model.predict(X_test_scaled)

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,

In [10]:
model.score(X_test_scaled, y_test)

0.5323590814196242

In [11]:
model.predict_proba(X_test_scaled)

array([[0.38344101, 0.61655899],
       [0.37556574, 0.62443426],
       [0.47435351, 0.52564649],
       [0.49680026, 0.50319974],
       [0.46441815, 0.53558185],
       [0.50314661, 0.49685339],
       [0.40750709, 0.59249291],
       [0.45330914, 0.54669086],
       [0.52968897, 0.47031103],
       [0.53191953, 0.46808047],
       [0.41948154, 0.58051846],
       [0.40308721, 0.59691279],
       [0.50307315, 0.49692685],
       [0.47368292, 0.52631708],
       [0.55040759, 0.44959241],
       [0.49815161, 0.50184839],
       [0.46738476, 0.53261524],
       [0.36143123, 0.63856877],
       [0.49936558, 0.50063442],
       [0.47304022, 0.52695978],
       [0.481564  , 0.518436  ],
       [0.45781303, 0.54218697],
       [0.46626812, 0.53373188],
       [0.37739862, 0.62260138],
       [0.54675402, 0.45324598],
       [0.52897726, 0.47102274],
       [0.48000918, 0.51999082],
       [0.57350744, 0.42649256],
       [0.34251872, 0.65748128],
       [0.46743901, 0.53256099],
       [0.

In [16]:
LR_model = LogisticRegression(random_state=1234 , solver='lbfgs')
LR_model = LR_model.fit(X_train_scaled, y_train)
y_pred_LR = LR_model.predict(X_test_scaled)
Accuracy_LR = metrics.accuracy_score(y_test, y_pred_LR)
print("LR Accuracy:",Accuracy_LR)

LR Accuracy: 0.5323590814196242


### Linear Discrimant Analysis

In [17]:
LDA_model = LinearDiscriminantAnalysis()
LDA_model.fit(X_train_scaled,y_train)
y_pred_LDA = LDA_model.predict(X_test_scaled)
Accuracy_LDA = metrics.accuracy_score(y_test, y_pred_LDA)
print("LDA Accuracy:",Accuracy_LDA)

LDA Accuracy: 0.5302713987473904


### Neural Network

In [29]:
for i in range(10,101,5):

    MLP_model = MLPClassifier(hidden_layer_sizes=(i,i,i), activation='tanh', solver='adam', max_iter=1500)
    MLP_model.fit(X_train_scaled, y_train)
    y_pred_MLP = MLP_model.predict(X_test_scaled)
    Accuracy_MLP = metrics.accuracy_score(y_test, y_pred_MLP)
    print("MLP Accuracy:",Accuracy_MLP)
    print(f"Hidden layer values:{i,i,i}")



MLP Accuracy: 0.48643006263048016
Hidden layer values:(10, 10, 10)
MLP Accuracy: 0.46764091858037576
Hidden layer values:(15, 15, 15)
MLP Accuracy: 0.5073068893528184
Hidden layer values:(20, 20, 20)
MLP Accuracy: 0.511482254697286
Hidden layer values:(25, 25, 25)
MLP Accuracy: 0.5198329853862212
Hidden layer values:(30, 30, 30)
MLP Accuracy: 0.5073068893528184
Hidden layer values:(35, 35, 35)
MLP Accuracy: 0.4989561586638831
Hidden layer values:(40, 40, 40)
MLP Accuracy: 0.48643006263048016
Hidden layer values:(45, 45, 45)
MLP Accuracy: 0.4989561586638831
Hidden layer values:(50, 50, 50)
MLP Accuracy: 0.4906054279749478
Hidden layer values:(55, 55, 55)
MLP Accuracy: 0.5135699373695198
Hidden layer values:(60, 60, 60)
MLP Accuracy: 0.5386221294363257
Hidden layer values:(65, 65, 65)
MLP Accuracy: 0.5052192066805845
Hidden layer values:(70, 70, 70)
MLP Accuracy: 0.5156576200417536
Hidden layer values:(75, 75, 75)
MLP Accuracy: 0.5260960334029228
Hidden layer values:(80, 80, 80)
MLP Accu

In [30]:
for i in range(10,101,5):

    MLP_model = MLPClassifier(hidden_layer_sizes=(i,i,i), activation='relu', solver='adam', max_iter=1500)
    MLP_model.fit(X_train_scaled, y_train)
    y_pred_MLP = MLP_model.predict(X_test_scaled)
    Accuracy_MLP = metrics.accuracy_score(y_test, y_pred_MLP)
    print("MLP Accuracy:",Accuracy_MLP)
    print(f"Hidden layer values:{i,i,i}")

MLP Accuracy: 0.5031315240083507
Hidden layer values:(10, 10, 10)
MLP Accuracy: 0.4968684759916493
Hidden layer values:(15, 15, 15)
MLP Accuracy: 0.48851774530271397
Hidden layer values:(20, 20, 20)
MLP Accuracy: 0.55741127348643
Hidden layer values:(25, 25, 25)
MLP Accuracy: 0.5031315240083507
Hidden layer values:(30, 30, 30)
MLP Accuracy: 0.4968684759916493
Hidden layer values:(35, 35, 35)
MLP Accuracy: 0.5031315240083507
Hidden layer values:(40, 40, 40)
MLP Accuracy: 0.5302713987473904
Hidden layer values:(45, 45, 45)
MLP Accuracy: 0.5302713987473904
Hidden layer values:(50, 50, 50)
MLP Accuracy: 0.534446764091858
Hidden layer values:(55, 55, 55)
MLP Accuracy: 0.5010438413361169
Hidden layer values:(60, 60, 60)
MLP Accuracy: 0.49269311064718163
Hidden layer values:(65, 65, 65)
MLP Accuracy: 0.5177453027139874
Hidden layer values:(70, 70, 70)
MLP Accuracy: 0.49478079331941544
Hidden layer values:(75, 75, 75)
MLP Accuracy: 0.511482254697286
Hidden layer values:(80, 80, 80)
MLP Accurac

In [48]:
MLP_model = MLPClassifier(hidden_layer_sizes=(25,25,25), activation='relu', solver='adam', max_iter=1500)
MLP_model.fit(X_train_scaled, y_train)
y_pred_MLP = MLP_model.predict(X_test_scaled)
Accuracy_MLP = metrics.accuracy_score(y_test, y_pred_MLP)
print("MLP Accuracy:",Accuracy_MLP)


MLP Accuracy: 0.5198329853862212


## Compare Results

In [None]:
accuracy_list = [Accuracy_LR, Accuracy_LDA, Accuracy_MLP]

model_list = ['Logistic Regression', 'Linear Discriminat','Neural Network']

df_accuracy = pd.DataFrame({'Model': model_list, 'Accuracy': accuracy_list})

In [None]:
order = list(df_accuracy.sort_values('Accuracy', ascending=False).Model)
df_accuracy = df_accuracy.sort_values('Accuracy', ascending=False).reset_index().drop(['index'], axis=1)

plt.figure(figsize=(12,8))
# make barplot and sort bars
x = sns.barplot(x='Model', y="Accuracy", data=df_accuracy, order = order, palette="rocket")
plt.xlabel("Model", fontsize=20)
plt.ylabel("Accuracy", fontsize=20)
plt.title("Accuracy by Model", fontsize=20)
plt.grid(linestyle='-', linewidth='0.5', color='grey')
plt.xticks(rotation=70, fontsize=12)
plt.ylim(0,1)
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

for i in range(len(model_list)):
    plt.text(x = i, y = df_accuracy.loc[i, 'Accuracy'] + 0.05, s = str(round((df_accuracy.loc[i, 'Accuracy'])*100, 2))+'%', 
             fontsize = 14, color='black',horizontalalignment='center')

#y_value=['{:,.2f}'.format(x) + '%' for x in ax.get_yticks()]
#ax.set_yticklabels(y_value)

plt.tight_layout()