In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
 df  = pd.read_csv(r"badminton")

In [9]:
df.drop(columns = 'Unnamed: 0',inplace = True)

In [10]:
df.head()

Unnamed: 0,Review text,Sentiment
0,nice product good quality but price rising bad...,Positive
1,didn t supplied yonex mavis outside cover wa y...,Negative
2,worst product damaged shuttlecock packed new b...,Negative
3,quite o k but nowadays quality cork like not y...,Neutral
4,pricedjust retailer didn t understand wat adva...,Negative


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [11]:
tfidf = TfidfVectorizer(max_features=5000)

In [12]:
import warnings
warnings.filterwarnings('ignore')
import time
import joblib
import os

In [13]:
x_train, x_test, y_train, y_test = train_test_split(df['Review text'] , df['Sentiment'] , test_size=0.2, random_state=42)

In [14]:
import mlflow
mlflow.set_experiment("Review_analysis")

<Experiment: artifact_location='file:///C:/Users/Asus/mlruns/721000997466972747', creation_time=1711966005346, experiment_id='721000997466972747', last_update_time=1711966005346, lifecycle_stage='active', name='Review_analysis', tags={}>

In [15]:
pipelines = {
    'Decision trees':Pipeline([
        ('vectorization',CountVectorizer()),
        ('classifier',DecisionTreeClassifier())
    ]),
    'Random forest':Pipeline([
        ('vectorization',CountVectorizer()),
        ('classifier',RandomForestClassifier())
    ]),
    'logistic regression':Pipeline([
        ('vectorization',CountVectorizer()),
        ('classifier',LogisticRegression())
    ]),
    'knn':Pipeline([
        ('vectorization',CountVectorizer()),
        ('classifier',KNeighborsClassifier())
    ])
}

param_grids = {
    'Decision trees': [
        {
            'vectorization': [CountVectorizer(), tfidf],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10]
        }],
    'Random forest':[
        {
            'vectorization': [CountVectorizer(), tfidf],
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'logistic regression':[
        {
            'vectorization': [CountVectorizer(), tfidf],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__C': [0.1, 1, 10], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
            
        }
    ],
    'knn':[
        {
            'vectorization': [CountVectorizer(), tfidf],
            'classifier__n_neighbors' : [i for i in range(3, 21, 2)], 
            'classifier__p' : [1, 2, 3]
        }
    ]
}


In [17]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(x_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(x_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()

********** Decision trees **********




Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 52.1 s
Wall time: 2min 5s
Train Score:  0.8707024182141273
Test Score:  0.8707403055229143

********** Random forest **********




Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: total: 6min 30s
Wall time: 10min 20s
Train Score:  0.87951849346328
Test Score:  0.8795534665099882

********** logistic regression **********




Fitting 5 folds for each of 72 candidates, totalling 360 fits
CPU times: total: 2min 16s
Wall time: 3min 58s
Train Score:  0.8339709831567659
Test Score:  0.836075205640423

********** knn **********




Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 17min 7s
Wall time: 6min 13s
Train Score:  0.8654134851608852
Test Score:  0.8666274970622797

