In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Load the pickled dataset and keep only the first row for each repeated index entry.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df = (
    pd.read_pickle("/content/data_with_mp.pkl")
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
# Keep a handle on the de-duplicated frame before the columns are trimmed below.
original_df = df

In [None]:
# Restrict the frame to the three ratio columns plus the market-performance column.
columns_to_keep = [
    'income_ratio',
    'expenses_ratio',
    'total_ratio',
    'market_performance',
]
df = df.loc[:, columns_to_keep]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income_ratio,expenses_ratio,total_ratio,market_performance
ticker,year,quarter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MSFT,2013,1,7e-06,2.2e-05,2.9e-05,9.003151
MSFT,2013,2,7e-06,1.6e-05,2.3e-05,-2.685384
MSFT,2013,3,8e-06,1.9e-05,2.7e-05,4.801043
MSFT,2013,4,6e-06,1.4e-05,2e-05,6.418472
MSFT,2014,1,6e-06,1.5e-05,2.1e-05,3.987568


In [None]:
# Lag market_performance by five rows *within each ticker*. A plain
# Series.shift(5) walks straight across ticker boundaries, so the first five
# rows of every ticker after the first would inherit the previous ticker's
# values instead of NaN.
# NOTE(review): shift(+5) pulls the value from five rows EARLIER, so this column
# holds past performance - confirm that is the intended target (a
# forward-looking target would use shift(-5)).
# NOTE(review): assumes rows are in chronological order per ticker and that no
# quarter is missing; otherwise five rows is not five quarters.
df['shift_by_5Q'] = df.groupby(level='ticker')['market_performance'].shift(5)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income_ratio,expenses_ratio,total_ratio,market_performance,shift_by_5Q
ticker,year,quarter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MSFT,2013,1,7e-06,2.2e-05,2.9e-05,9.003151,
MSFT,2013,2,7e-06,1.6e-05,2.3e-05,-2.685384,
MSFT,2013,3,8e-06,1.9e-05,2.7e-05,4.801043,
MSFT,2013,4,6e-06,1.4e-05,2e-05,6.418472,
MSFT,2014,1,6e-06,1.5e-05,2.1e-05,3.987568,
MSFT,2014,2,5e-06,1e-05,1.5e-05,11.318261,9.003151
MSFT,2014,3,5e-06,1.3e-05,1.8e-05,-10.254045,-2.685384
MSFT,2014,4,5e-06,1.1e-05,1.6e-05,-3.535956,4.801043
MSFT,2015,1,5e-06,1.3e-05,1.8e-05,12.915192,6.418472
MSFT,2015,2,5e-06,1.1e-05,1.6e-05,20.781172,3.987568


In [None]:
# Discard every row that contains a NaN in any column (e.g. rows whose lagged value is undefined).
df = df.dropna()

In [None]:
# Show the frame as it stands after the NaN rows were dropped.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,income_ratio,expenses_ratio,total_ratio,market_performance,shift_by_5Q
ticker,year,quarter,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MSFT,2014,2,0.000005,0.000010,0.000015,11.318261,9.003151
MSFT,2014,3,0.000005,0.000013,0.000018,-10.254045,-2.685384
MSFT,2014,4,0.000005,0.000011,0.000016,-3.535956,4.801043
MSFT,2015,1,0.000005,0.000013,0.000018,12.915192,6.418472
MSFT,2015,2,0.000005,0.000011,0.000016,20.781172,3.987568
...,...,...,...,...,...,...,...
CHSCP,2022,2,0.000014,0.000121,0.000134,-3.370349,-8.118336
CHSCP,2022,3,0.000016,0.000089,0.000105,-5.312781,-2.155689
CHSCP,2022,4,0.000014,0.000150,0.000165,-4.413229,0.732678
CHSCP,2023,1,0.000016,0.000093,0.000109,-4.112975,18.032175


# Splitting the data

In [None]:
# Features: every column except the last. Target: the last column ('shift_by_5Q').
# NOTE(review): this feature set includes 'market_performance', whereas the
# classification split further down excludes it - confirm which is intended.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split before scaling so the scaler statistics never see the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and apply the same transform to the
# test rows, so the models are actually trained on standardized features.
# The arrays are wrapped back into DataFrames to keep index and column labels.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fit a linear regression on the training split, then predict the held-out rows.
slr = LinearRegression().fit(X_train, y_train)
y_pred = slr.predict(X_test)

In [None]:
# Score the held-out predictions with mean squared error and R-squared.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

for label, value in (("Mean Squared Error:", mse), ("R-squared Score:", r2)):
    print(label, value)

Mean Squared Error: 2893.5078899330492
R-squared Score: -0.0024426143247471277


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
def update_market_performance(mp):
    """Collapse a numeric value into a sign class.

    Returns 1 when ``mp`` is greater than zero, -1 when it is less than zero,
    and 0 otherwise (zero itself, or any value such as NaN for which both
    comparisons are false).
    """
    if mp > 0:
        return 1
    return -1 if mp < 0 else 0

# Overwrite the numeric target with its sign class (1 / -1 / 0); from this
# point on the column no longer holds the original numeric values.
df['shift_by_5Q'] = df['shift_by_5Q'].map(update_market_performance)

In [None]:
# Features: every column except the last two. Target: the last column
# ('shift_by_5Q', which holds the sign classes once the cell above has run).
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

# Split before scaling so the scaler statistics never see the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and apply the same transform to the
# test rows. The classifiers below (logistic regression, kNN, SVC) are
# sensitive to feature scale, so they must be trained on the standardized
# values rather than the raw ones.
# The arrays are wrapped back into DataFrames to keep index and column labels.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)


In [None]:
# Logistic-regression classifier with the iteration cap set to 1000,
# fitted on the training split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report the share of correct labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5567742932179824


# kNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [None]:
k = 10  # number of neighbours passed to the classifier

# Fit the k-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Predict the held-out rows and report the share of correct labels.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5497454881999074


# SVM


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Support-vector classifier with library defaults, fitted on the training split.
svm_clf = SVC().fit(X_train, y_train)

# Predict the held-out rows and report the share of correct labels.
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5463135068153655


# Combination of models

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [None]:
# Base estimators for the voting ensemble.
# The random forest and AdaBoost models are randomized, so they are seeded
# (same seed as the train/test split) to make a Restart & Run All reproduce
# the same accuracy.
log_reg = LogisticRegression(max_iter=1000)
svm_clf = SVC()
rf_clf = RandomForestClassifier(random_state=42)
ada_clf = AdaBoostClassifier(random_state=42)

In [None]:
# Hard-voting ensemble over the four base estimators, fitted on the training split.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_reg),
        ('svm', svm_clf),
        ('rf', rf_clf),
        ('ada', ada_clf),
    ],
    voting='hard',
).fit(X_train, y_train)

# Predict the held-out rows and report the share of correct labels.
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5567742932179824
