In [2]:
import chardet

with open('train.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

print(f"Detected encoding: {encoding}")


Detected encoding: Windows-1254


In [4]:
import pandas as pd

input_file = 'train.csv'
output_file = 'train_utf8.csv'

# Read the file in binary mode
with open(input_file, 'rb') as f:
    content = f.read()

# Decode using the detected encoding
decoded_content = content.decode('Windows-1254', errors='replace')

# Write the decoded content to a new file with UTF-8 encoding
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(decoded_content)

# Load the new file with UTF-8 encoding
df = pd.read_csv(output_file, encoding='utf-8')

# Print first few rows to verify
print(df.head())


   Item ID  Sentiment                                      SentimentText
0        1          0  @RailMinIndia My PNR is 8348062961, I am in wa...
1        2          0  @sureshpprabhu @RailMinIndia AC not working in...
2        3          0  @RailMinIndia I'm traveling to chennai by trai...
3        4          5  @RailMinIndia irctc is not responding at the t...
4        5          7  @DRMbhopal @RailMinIndia @sanjaygupta2012 @drm...


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Load the UTF-8 encoded data
df = pd.read_csv('train_utf8.csv', encoding='utf-8')

# Prepare features and target
X = df['SentimentText']
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the model
model = SVC()
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))

# Function to predict priority for new complaints
def predict_priority(new_complaint):
    vectorized_complaint = vectorizer.transform([new_complaint])
    return model.predict(vectorized_complaint)[0]

# Example usage
new_complaint = "Train is delayed by 2 hours"
print(f"Predicted priority: {predict_priority(new_complaint)}")


              precision    recall  f1-score   support

           0       0.74      0.93      0.82        57
           1       1.00      0.19      0.32        16
           2       0.00      0.00      0.00         4
           3       1.00      0.61      0.76        23
           4       0.87      0.87      0.87        39
           5       0.70      0.94      0.80        70
           6       0.67      0.59      0.62        41
           7       0.88      0.58      0.70        24

    accuracy                           0.76       274
   macro avg       0.73      0.59      0.61       274
weighted avg       0.78      0.76      0.74       274

Predicted priority: 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Load the data
df = pd.read_csv('train_utf8.csv', encoding='utf-8')

# Prepare features and target
X = df['SentimentText']
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and MultinomialNB
model_nb = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model_nb.fit(X_train, y_train)

# Evaluate the model
y_pred_nb = model_nb.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.98      0.76        57
           1       1.00      0.06      0.12        16
           2       0.00      0.00      0.00         4
           3       1.00      0.30      0.47        23
           4       0.79      0.69      0.74        39
           5       0.59      0.97      0.73        70
           6       0.82      0.44      0.57        41
           7       1.00      0.17      0.29        24

    accuracy                           0.66       274
   macro avg       0.73      0.45      0.46       274
weighted avg       0.74      0.66      0.61       274



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Load the data
df = pd.read_csv('train_utf8.csv', encoding='utf-8')

# Prepare features and target
X = df['SentimentText']
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TfidfVectorizer and LogisticRegression
model_lr = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))

# Train the model
model_lr.fit(X_train, y_train)

# Evaluate the model
y_pred_lr = model_lr.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.95      0.84        57
           1       1.00      0.31      0.48        16
           2       0.00      0.00      0.00         4
           3       1.00      0.65      0.79        23
           4       0.85      0.85      0.85        39
           5       0.73      0.94      0.83        70
           6       0.67      0.59      0.62        41
           7       0.82      0.58      0.68        24

    accuracy                           0.77       274
   macro avg       0.73      0.61      0.64       274
weighted avg       0.78      0.77      0.75       274



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv('train_utf8.csv', encoding='utf-8')

# Feature and target split
X = df.drop('SentimentText', axis=1)  # Replace 'target_column' with the actual target column
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression Model with Cross-Validation
log_reg = LogisticRegression()

# Cross-validation
log_reg_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Logistic Regression Cross-Validation Accuracy: {log_reg_scores.mean()}")

# Hyperparameter Tuning using GridSearchCV
log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

grid_log_reg = GridSearchCV(log_reg, log_reg_params, cv=5, scoring='accuracy')
grid_log_reg.fit(X_train_scaled, y_train)

# Best parameters and performance
print(f"Best parameters for Logistic Regression: {grid_log_reg.best_params_}")
print(f"Best cross-validation score for Logistic Regression: {grid_log_reg.best_score_}")

# Evaluate on test data
y_pred_log_reg = grid_log_reg.predict(X_test_scaled)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))


Logistic Regression Cross-Validation Accuracy: 0.908993237347295
Best parameters for Logistic Regression: {'C': 100, 'solver': 'lbfgs'}
Best cross-validation score for Logistic Regression: 1.0
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        86
           1       1.00      1.00      1.00        26
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00        45
           4       1.00      1.00      1.00        62
           5       1.00      1.00      1.00        97
           6       1.00      1.00      1.00        58
           7       1.00      1.00      1.00        30

    accuracy                           1.00       410
   macro avg       1.00      1.00      1.00       410
weighted avg       1.00      1.00      1.00       410



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [2]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv('train_utf8.csv', encoding='utf-8')

# Feature and target split
X = df.drop('SentimentText', axis=1)  # Replace 'target_column' with the actual target column
y = df['Sentiment']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# SVM Model with scaling
svm = make_pipeline(StandardScaler(), SVC())

# Cross-validation
svm_scores = cross_val_score(svm, X_train, y_train, cv=5, scoring='accuracy')
print(f"SVM Cross-Validation Accuracy: {svm_scores.mean()}")

# Hyperparameter Tuning using GridSearchCV
svm_params = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy')
grid_svm.fit(X_train, y_train)

# Best parameters and performance
print(f"Best parameters for SVM: {grid_svm.best_params_}")
print(f"Best cross-validation score for SVM: {grid_svm.best_score_}")

# Evaluate on test data
y_pred_svm = grid_svm.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))


SVM Cross-Validation Accuracy: 0.9989583333333332
Best parameters for SVM: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}
Best cross-validation score for SVM: 1.0
SVM Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        86
           1       1.00      1.00      1.00        26
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00        45
           4       1.00      1.00      1.00        62
           5       1.00      1.00      1.00        97
           6       1.00      1.00      1.00        58
           7       1.00      1.00      1.00        30

    accuracy                           1.00       410
   macro avg       1.00      1.00      1.00       410
weighted avg       1.00      1.00      1.00       410



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

# Define a function to perform the model pipeline and cross-validation

def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    return grid_search

# 1. Logistic Regression
log_reg = LogisticRegression(max_iter=500)
log_reg_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'model__solver': ['lbfgs', 'liblinear']  # Different solvers
}

print("Evaluating Logistic Regression...")
best_log_reg = evaluate_model(log_reg, log_reg_params)

# 2. Support Vector Machine (SVM)
svm = SVC()
svm_params = {
    'model__C': [0.01, 0.1, 1, 10],  # Regularization
    'model__kernel': ['linear', 'rbf'],  # Kernel type
    'model__gamma': ['scale', 'auto']  # Kernel coefficient
}

print("\nEvaluating SVM...")
best_svm = evaluate_model(svm, svm_params)



Evaluating Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .................model__C=0.01, model__solver=lbfgs; total time=   0.0s
[CV] END .................model__C=0.01, model__solver=lbfgs; total time=   0.0s
[CV] END .................model__C=0.01, model__solver=lbfgs; total time=   0.0s
[CV] END .................model__C=0.01, model__solver=lbfgs; total time=   0.0s
[CV] END .................model__C=0.01, model__solver=lbfgs; total time=   0.0s
[CV] END .............model__C=0.01, model__solver=liblinear; total time=   0.0s
[CV] END .............model__C=0.01, model__solver=liblinear; total time=   0.0s
[CV] END .............model__C=0.01, model__solver=liblinear; total time=   0.0s
[CV] END .............model__C=0.01, model__solver=liblinear; total time=   0.0s
[CV] END .............model__C=0.01, model__solver=liblinear; total time=   0.0s
[CV] END ..................model__C=0.1, model__solver=lbfgs; total time=   0.0s
[CV] END .....

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Preprocessing: Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

# Lemmatization transformer
lemmatizer_transformer = FunctionTransformer(lambda X: [lemmatize_text(doc) for doc in X], validate=False)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 2))  # Bi-grams added

# Define a function to perform the model pipeline and cross-validation

def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('lemmatizer', lemmatizer_transformer),
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    return grid_search

# 1. Logistic Regression with expanded parameter grid
log_reg = LogisticRegression(max_iter=500)
log_reg_params = {
    'model__C': [0.01, 0.1, 1, 10, 100, 1000],  # Expanded Regularization parameter search
    'model__solver': ['lbfgs', 'liblinear'],
    'model__penalty': ['l2']  # Keep using L2 penalty
}

print("Evaluating Logistic Regression...")
best_log_reg = evaluate_model(log_reg, log_reg_params)

# 2. Support Vector Machine (SVM) with expanded search space
svm = SVC()
svm_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization
    'model__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # More kernel options
    'model__gamma': ['scale', 'auto'],  # Kernel coefficient
    'model__degree': [2, 3, 4]  # For polynomial kernel
}

print("\nEvaluating SVM...")
best_svm = evaluate_model(svm, svm_params)


[nltk_data] Downloading package wordnet to C:\Users\DARREN
[nltk_data]     CHAHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Evaluating Logistic Regression...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END model__C=0.01, model__penalty=l2, model__solver=lbfgs; total time=   1.3s
[CV] END model__C=0.01, model__penalty=l2, model__solver=lbfgs; total time=   0.1s
[CV] END model__C=0.01, model__penalty=l2, model__solver=lbfgs; total time=   0.1s
[CV] END model__C=0.01, model__penalty=l2, model__solver=lbfgs; total time=   0.1s
[CV] END model__C=0.01, model__penalty=l2, model__solver=lbfgs; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l2, model__solver=liblinear; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l2, model__solver=liblinear; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l2, model__solver=liblinear; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l2, model__solver=liblinear; total time=   0.0s
[CV] END model__C=0.01, model__penalty=l2, model__solver=liblinear; total time=   0.0s
[CV] END model__C=0.1, model__penalty=l2, model__solver

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define X and y
X = df['SentimentText']
y = df['Sentiment']

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_tfidf = tfidf.fit_transform(X_train)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

# Create Pipelines for Logistic Regression and SVM
log_reg = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('logreg', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

svm = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('svc', SVC(class_weight='balanced'))
])

# Parameter Grids for GridSearch
param_grid_logreg = {
    'logreg__C': [0.1, 1, 10, 100],
    'logreg__solver': ['lbfgs', 'liblinear']
}

param_grid_svm = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto']
}

# GridSearchCV for Logistic Regression
grid_logreg = GridSearchCV(log_reg, param_grid_logreg, cv=5, scoring='accuracy', n_jobs=-1)
grid_logreg.fit(X_train, y_train)

# GridSearchCV for SVM
grid_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm.fit(X_train, y_train)

# Evaluate Logistic Regression
y_pred_logreg = grid_logreg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))
print(f"Logistic Regression Best Parameters: {grid_logreg.best_params_}")
print(f"Logistic Regression Best Cross-Validation Score: {grid_logreg.best_score_}")
print(f"Logistic Regression Test Accuracy: {accuracy_score(y_test, y_pred_logreg)}")

# Evaluate SVM
y_pred_svm = grid_svm.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print(f"SVM Best Parameters: {grid_svm.best_params_}")
print(f"SVM Best Cross-Validation Score: {grid_svm.best_score_}")
print(f"SVM Test Accuracy: {accuracy_score(y_test, y_pred_svm)}")

# Voting Classifier (Ensemble of Logistic Regression and SVM)
voting_clf = VotingClassifier(estimators=[
    ('logreg', grid_logreg.best_estimator_),
    ('svm', grid_svm.best_estimator_)
], voting='soft')

voting_clf.fit(X_train, y_train)

# Evaluate Voting Classifier
y_pred_voting = voting_clf.predict(X_test)
print("Voting Classifier Classification Report:")
print(classification_report(y_test, y_pred_voting))
print(f"Voting Classifier Test Accuracy: {accuracy_score(y_test, y_pred_voting)}")


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

Final working Code

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from nltk.stem import WordNetLemmatizer
import nltk
import re

nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize_text(text):
    # Remove special characters and unwanted tokens (e.g., @RailMinIndia)
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = ' '.join(word for word in text.split() if not word.startswith('@'))  # Remove tokens starting with '@'
    
    # Convert to lowercase and lemmatize
    text = text.lower()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Lemmatization and cleaning transformer
clean_lemmatizer_transformer = FunctionTransformer(lambda X: [clean_and_lemmatize_text(doc) for doc in X], validate=False)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 3))  # Bi-grams added

# Define a function to perform the model pipeline and cross-validation
def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('clean_lemmatizer', clean_lemmatizer_transformer),
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    return grid_search


# 1. Support Vector Machine (SVM) with expanded search space
svm = SVC()
svm_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization
    'model__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # More kernel options
    'model__gamma': ['scale', 'auto'],  # Kernel coefficient
    'model__degree': [2, 3, 4]  # For polynomial kernel
}

print("\nEvaluating SVM...")
best_svm = evaluate_model(svm, svm_params)

[nltk_data] Downloading package wordnet to C:\Users\DARREN
[nltk_data]     CHAHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Evaluating SVM...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   1.3s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.1s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.1s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END mod

Creating pkl file for Model

In [1]:
import joblib


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from nltk.stem import WordNetLemmatizer
import nltk
import re
import joblib

nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize_text_single(text):
    # Remove special characters and unwanted tokens (e.g., @RailMinIndia)
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = ' '.join(word for word in text.split() if not word.startswith('@'))  # Remove tokens starting with '@'
    
    # Convert to lowercase and lemmatize
    text = text.lower()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def clean_and_lemmatize_text(texts):
    return [clean_and_lemmatize_text_single(doc) for doc in texts]

# Lemmatization and cleaning transformer
clean_lemmatizer_transformer = FunctionTransformer(clean_and_lemmatize_text, validate=False)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 3))  # Bi-grams added

# Define a function to perform the model pipeline and cross-validation
def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('clean_lemmatizer', clean_lemmatizer_transformer),
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    # Save the best model
    joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
    print("Model saved to 'best_model.pkl'")
    
    return grid_search

# 1. Support Vector Machine (SVM) with expanded search space
svm = SVC()
svm_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization
    'model__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # More kernel options
    'model__gamma': ['scale', 'auto'],  # Kernel coefficient
    'model__degree': [2, 3, 4]  # For polynomial kernel
}

print("\nEvaluating SVM...")
best_svm = evaluate_model(svm, svm_params)


[nltk_data] Downloading package wordnet to C:\Users\DARREN
[nltk_data]     CHAHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Evaluating SVM...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END mod

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
import joblib
from text_cleaner import clean_and_lemmatize_text  # Import from your module

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Lemmatization and cleaning transformer
clean_lemmatizer_transformer = FunctionTransformer(clean_and_lemmatize_text, validate=False)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 3))  # Bi-grams added

# Define a function to perform the model pipeline and cross-validation
def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('clean_lemmatizer', clean_lemmatizer_transformer),
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    # Save the best model
    joblib.dump(grid_search.best_estimator_, 'best_model1.pkl')
    print("Model saved to 'best_model1.pkl'")
    
    return grid_search

# 1. Support Vector Machine (SVM) with expanded search space
svm = SVC()
svm_params = {
    'model__C': [0.01, 0.1, 1, 10, 100],  # Regularization
    'model__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # More kernel options
    'model__gamma': ['scale', 'auto'],  # Kernel coefficient
    'model__degree': [2, 3, 4]  # For polynomial kernel
}

print("\nEvaluating SVM...")
best_svm = evaluate_model(svm, svm_params)



Evaluating SVM...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   1.5s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=linear; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END model__C=0.01, model__degree=2, model__gamma=scale, model__kernel=rbf; total time=   0.2s
[CV] END mod

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import FunctionTransformer
from nltk.stem import WordNetLemmatizer
import nltk
import re
import joblib

nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('train_utf8.csv')

# Define features (X) and target (y)
X = df['SentimentText']
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize_text_single(text):
    # Remove special characters and unwanted tokens (e.g., @RailMinIndia)
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = ' '.join(word for word in text.split() if not word.startswith('@'))  # Remove tokens starting with '@'
    
    # Convert to lowercase and lemmatize
    text = text.lower()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

def clean_and_lemmatize_text(texts):
    return [clean_and_lemmatize_text_single(doc) for doc in texts]

# Lemmatization and cleaning transformer
clean_lemmatizer_transformer = FunctionTransformer(clean_and_lemmatize_text, validate=False)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 3))  # Bi-grams added

# Define a function to perform the model pipeline and cross-validation
def evaluate_model(model, param_grid):
    pipeline = Pipeline([
        ('clean_lemmatizer', clean_lemmatizer_transformer),
        ('tfidf', tfidf),
        ('model', model)
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)
    
    # Output best parameters and scores
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_}")
    
    # Test set evaluation
    y_pred = grid_search.predict(X_test)
    print("\nTest set evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    
    # Save the best model
    joblib.dump(grid_search.best_estimator_, 'best_model.pkl')
    print("Model saved to 'best_model.pkl'")
    
    return grid_search

[nltk_data] Downloading package wordnet to C:\Users\DARREN
[nltk_data]     CHAHAL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1]:
import nltk
nltk.download('wordnet', download_dir='nltk_data')


[nltk_data] Downloading package wordnet to nltk_data...


True