# Logistic regression

In [None]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
#from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Read the dataset, decoding the file as ISO-8859-1 (Latin-1).
DATA_PATH = 'df4.csv'
DATA_ENCODING = 'ISO-8859-1'
df1 = pd.read_csv(DATA_PATH, encoding=DATA_ENCODING)

# Show the first rows as a quick check of the loaded frame.
df1.head()

# Mark unlabeled rows with -1 (required by SelfTrainingClassifier)

In [None]:
# Text feature column. Empty CSV fields are read back as NaN, which
# TfidfVectorizer refuses to process, so treat them as empty documents.
X = df1["cleaned_text"].fillna("")

# SelfTrainingClassifier expects unlabeled samples to carry the label -1.
# fillna() returns a new Series, so df1["sentiment"] is never modified
# through a view (the previous masked assignment could write back into df1
# depending on the pandas version / copy-on-write setting).
y = df1["sentiment"].fillna(-1)

# Sanity check: no missing labels should remain (expected output: 0).
print(y.isna().sum())


In [None]:
# Integer encoding of the sentiment labels; -1 stays reserved for unlabeled rows.
LABEL_TO_ID = {
    -1: -1,
    "negative": 0,
    "neutral": 1,
    "positive": 2,
}

encoded_labels = y.map(LABEL_TO_ID)

# Series.map() turns every value that is not a key of the dict into NaN.
# NaN also satisfies `y != -1`, so such rows would later be treated as
# labeled data. Fail early and show the offending values instead.
unmapped_labels = y[encoded_labels.isna()]
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected sentiment labels: "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )

y = encoded_labels

In [None]:
# Boolean mask of the rows that carry the "unlabeled" marker (-1).
is_unlabeled = y == -1

# Rows with a real sentiment label.
X_labeled, y_labeled = X[~is_unlabeled], y[~is_unlabeled]

# Rows without a label; their targets are all -1.
X_unlabeled, y_unlabeled = X[is_unlabeled], y[is_unlabeled]

In [None]:
# Hold out 20% of the labeled rows for evaluation.
# random_state makes the split (and every metric below) reproducible;
# stratify keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.2,
    stratify=y_labeled,
    random_state=42,
)

# Unlabeled rows (target -1) are appended to the training part only, so the
# test set contains labeled rows exclusively.
X_train = pd.concat([X_train, X_unlabeled])
y_train = pd.concat([y_train, y_unlabeled])

# Applying logistic regression

In [None]:
# Step 1: turn raw text into TF-IDF features.
tfidf_step = ("tfidf", TfidfVectorizer())

# Step 2: logistic regression (up to 200 solver iterations) wrapped in a
# self-training classifier so rows labeled -1 can be used during fitting.
self_training_step = (
    "model",
    SelfTrainingClassifier(LogisticRegression(max_iter=200)),
)

model1 = Pipeline(steps=[tfidf_step, self_training_step])

In [None]:
# Fit the pipeline on the labeled training rows plus the unlabeled (-1) rows.
model1.fit(X=X_train, y=y_train)

In [None]:
# Predict sentiment classes for the held-out labeled rows.
y_pred = model1.predict(X=X_test)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of logistic regression is :", lr_accuracy)

In [None]:
# Weighted F1: per-class F1 averaged with weights equal to each class's support.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

In [None]:
# Report the weighted F1 score stored in `f1`.
f1_label = "F1 Score of logistic regression is :"
print(f1_label, f1)

In [None]:
# Per-class precision, recall, F1 and support on the held-out rows.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(
    "Classification Report:\n",
    classification_rep,
)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Wrap it in a DataFrame so the notebook renders it as a table.
pd.DataFrame(data=conf_matrix)

# Applying logistic regression using Over-sampling with SMOTE

In [None]:
# SMOTE must balance the labeled classes only. With its default strategy it
# treats the -1 "unlabeled" marker as one more class: it would either
# synthesise artificial unlabeled rows or inflate every labeled class up to
# the number of unlabeled rows. Restrict it to the labeled classes and
# oversample each of them to the size of the largest labeled class.
labeled_counts = y_train[y_train != -1].value_counts()
largest_labeled_class = int(labeled_counts.max())
smote_targets = {label: largest_labeled_class for label in labeled_counts.index}

model1 = Pipeline(steps=[
    # Raw text -> TF-IDF features.
    ("tfidf", TfidfVectorizer()),
    # Oversample labeled minority classes; random_state keeps runs reproducible.
    ("smote", SMOTE(sampling_strategy=smote_targets, random_state=42)),
    # Self-training logistic regression that also uses the rows labeled -1.
    ("model", SelfTrainingClassifier(LogisticRegression(max_iter=200))),
])

In [None]:
# Fit the SMOTE pipeline on the labeled training rows plus the unlabeled (-1) rows.
model1.fit(X=X_train, y=y_train)

In [None]:
# Predict sentiment classes for the held-out labeled rows with the SMOTE pipeline.
y_pred = model1.predict(X=X_test)

In [None]:
# Weighted F1: per-class F1 averaged with weights equal to each class's support.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")
print(
    "F1 Score is : ",
    f1,
)

In [None]:
# Fraction of held-out rows whose predicted class matches the true class.
smote_lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is : ", smote_lr_accuracy)

In [None]:
# Per-class precision, recall, F1 and support on the held-out rows.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(
    "Classification Report:\n",
    classification_rep,
)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Wrap it in a DataFrame so the notebook renders it as a table.
pd.DataFrame(data=conf_matrix)

In [None]:
# joblib was used here without ever being imported, which raised NameError.
# It is installed as a dependency of scikit-learn, so import it locally.
import joblib

# Persist the fitted pipeline (vectorizer, sampler and classifier) to disk.
joblib.dump(model1, 'BestModel.joblib')