<a href="https://colab.research.google.com/github/ebamberg/research-projects-ml/blob/main/Classic_models/Classifier_Financial_QA_10k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification example

In [81]:
!pip install pandas scikit-learn nltk matplotlib seaborn



In [82]:
import pandas as pd

# Load dataset
data = pd.read_csv("Financial-QA-10k.csv")
data

Unnamed: 0,question,answer,context,ticker,filing
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K
...,...,...,...,...,...
6995,What was the interest rate for the 5.400% Seni...,5.400%,The 5.400% Senior Notes due in 2028 have an in...,LVS,2023_10K
6996,What changes were made to the LVSC Revolving C...,The Fourth Amendment to the LVSC Revolving Cre...,"On January 30, 2023, LVSC entered into amendme...",LVS,2023_10K
6997,What was the increase in interest expense for ...,The interest expense increased by $30 million ...,"Following the downgrades, each series of the o...",LVS,2023_10K
6998,What are the new leverage and interest coverag...,"As of January 2024, the new leverage ratio sho...",The amended and restated facility agreement wi...,LVS,2023_10K


In [83]:
data["combined"]=data["question"]+data["answer"]
data["combined"]

Unnamed: 0,combined
0,What area did NVIDIA initially focus on before...
1,What are some of the recent applications of GP...
2,What significant invention did NVIDIA create i...
3,How does NVIDIA's platform strategy contribute...
4,What does NVIDIA's CUDA programming model enab...
...,...
6995,What was the interest rate for the 5.400% Seni...
6996,What changes were made to the LVSC Revolving C...
6997,What was the increase in interest expense for ...
6998,What are the new leverage and interest coverag...


In [85]:
data=data[data["combined"].notnull()]

import regex as re
def cleanup(text):
  cleaned=text.lower()
  cleaned=re.sub(r'[^A-Za-z\s]',"",cleaned)
  return cleaned

data["combined"]=data["combined"].apply(cleanup)
data["combined"]

Unnamed: 0,combined
0,what area did nvidia initially focus on before...
1,what are some of the recent applications of gp...
2,what significant invention did nvidia create i...
3,how does nvidias platform strategy contribute ...
4,what does nvidias cuda programming model enabl...
...,...
6995,what was the interest rate for the senior not...
6996,what changes were made to the lvsc revolving c...
6997,what was the increase in interest expense for ...
6998,what are the new leverage and interest coverag...


In [86]:
data["ticker"]

Unnamed: 0,ticker
0,NVDA
1,NVDA
2,NVDA
3,NVDA
4,NVDA
...,...
6995,LVS
6996,LVS
6997,LVS
6998,LVS


In [90]:
from sklearn.preprocessing import LabelEncoder
# one hot encoding the label - best for data with no order
one_hot = pd.get_dummies(data['ticker'], dtype=int)
# using label encoder
label_encoder = LabelEncoder()
data['ticker_encoded'] = label_encoder.fit_transform(data['ticker'])

data[ ['ticker','ticker_encoded'] ]

Unnamed: 0,ticker,ticker_encoded
0,NVDA,58
1,NVDA,58
2,NVDA,58
3,NVDA,58
4,NVDA,58
...,...,...
6995,LVS,53
6996,LVS,53
6997,LVS,53
6998,LVS,53


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["combined"])
y = data["ticker_encoded"]


In [97]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 # model=SVC() # support vector machine
model=LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

# Evaluation

In [98]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f"Accuracy: {accuracy}")
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.47
              precision    recall  f1-score   support

           0       0.65      0.61      0.63        18
           1       0.86      0.52      0.65        23
           2       0.10      0.31      0.15        13
           3       0.60      0.39      0.47        23
           4       0.16      0.31      0.21        13
           5       0.43      0.38      0.40        24
           6       0.29      0.10      0.15        20
           7       0.33      0.28      0.30        25
           8       0.56      0.22      0.31        23
           9       0.88      0.64      0.74        22
          10       0.36      0.33      0.35        12
          11       0.57      0.19      0.29        21
          12       0.85      0.44      0.58        25
          13       0.94      0.74      0.83        23
          14       0.55      0.73      0.63        15
          15       0.78      0.29      0.42        24
          16       0.18      0.29      0.22        17
          17