In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from nltk.tokenize import word_tokenize
import re

In [2]:
train_data = pd.read_csv('hi-train.csv', names=['label', 'text'])
test_data = pd.read_csv('hi-test.csv', names=['label', 'text'])
valid_data = pd.read_csv('hi-valid.csv', names=['label', 'text'])
hindi_stopwords = pd.read_csv('stopwords.txt', delimiter='\t', names=['stopwords'])

In [3]:
train_data.head()


Unnamed: 0,label,text
0,neutral,"निर्माता :\nशीतल विनोद तलवार, मधु‍ मैंटेना\n\n..."
1,positive,’उड़ान’ से विक्रमादित्य\nमोटवाने\nने अच्छे सिन...
2,neutral,फिल्म में गानों के दृश्य में अनुष्का को माइक क...
3,neutral,फांसी चढ़ने से पहले वह पाकिस्तान के सदर से खास...
4,positive,राज कुमार हिरानी ने इस मूवी में भी अपने डायरेक...


In [4]:
# Step 2: Map Labels to Scores
label_mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
train_data['label'] = train_data['label'].map(label_mapping)
test_data['label'] = test_data['label'].map(label_mapping)
valid_data['label'] = valid_data['label'].map(label_mapping)

In [5]:
train_data.head()

Unnamed: 0,label,text
0,0,"निर्माता :\nशीतल विनोद तलवार, मधु‍ मैंटेना\n\n..."
1,1,’उड़ान’ से विक्रमादित्य\nमोटवाने\nने अच्छे सिन...
2,0,फिल्म में गानों के दृश्य में अनुष्का को माइक क...
3,0,फांसी चढ़ने से पहले वह पाकिस्तान के सदर से खास...
4,1,राज कुमार हिरानी ने इस मूवी में भी अपने डायरेक...


In [6]:
# Preprocess stopwords (tokenize, remove spaces, lowercase, etc.)
def preprocess_stopwords(stopword_list):
    processed_stopwords = []
    for stopword in stopword_list:
        tokenized = word_tokenize(stopword)  # Tokenizing
        cleaned = [re.sub(r'\W+', '', word.lower()) for word in tokenized]  # Lowercase and remove punctuation
        processed_stopwords.extend(cleaned)
    return processed_stopwords

# Apply preprocessing to the stopwords
processed_stopwords = preprocess_stopwords(hindi_stopwords['stopwords'])

In [7]:
# Step 3: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words=hindi_stopwords['stopwords'].tolist(), max_features=10000)

# Fit and transform the text data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
X_valid_tfidf = tfidf_vectorizer.transform(valid_data['text'])

y_train = train_data['label']
y_test = test_data['label']
y_valid = valid_data['label']



In [8]:
y_train.head()

0    0
1    1
2    0
3    0
4    1
Name: label, dtype: int64

In [9]:
# Step 4: Initialize the Regressor
regressor = RandomForestRegressor(n_estimators=200, random_state=42)

# Train the regressor
regressor.fit(X_train_tfidf, y_train)

In [10]:
sentiment_scores = regressor.predict(X_test_tfidf)

In [11]:
# Step 5: Evaluate the Model
mse = mean_squared_error(y_test, sentiment_scores)
mae = mean_absolute_error(y_test, sentiment_scores)
r2 = r2_score(y_test, sentiment_scores)

In [12]:
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared score: {r2}')

Mean Squared Error: 0.595678550194144
Mean Absolute Error: 0.6344951227820114
R-squared score: 0.15348532069003284


In [13]:
# Step 6: (Optional) Converting Scores to Labels for Classification Accuracy
predicted_labels = ['Positive' if score > 0.5 else 'Negative' if score < -0.5 else 'Neutral' for score in sentiment_scores]

In [15]:
# Compare predicted labels to actual labels
accuracy = np.mean([pred == actual for pred, actual in zip(predicted_labels, test_data['label'])])
print(f'Classification Accuracy: {accuracy}')

Classification Accuracy: 0.0


In [16]:
# Step 7: (Optional) Hyperparameter Tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [17]:
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [18]:
# Best model parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}


In [19]:

# Use the best estimator to predict
best_regressor = grid_search.best_estimator_
best_sentiment_scores = best_regressor.predict(X_test_tfidf)


In [26]:
# Evaluate the tuned model
best_mse = mean_squared_error(y_test, best_sentiment_scores)
best_r2 = r2_score(y_test, best_sentiment_scores)

print(f'Best Model Mean Squared Error: {best_mse}')
print(f'Best Model R-squared score: {best_r2}')

Best Model Mean Squared Error: 0.600558351260025
Best Model R-squared score: 0.14655066905110037


In [34]:
# Saving the model

joblib.dump(best_model, 'best_sentiment_model.pkl')
print("Model and vectorizer saved successfully!")

NameError: name 'best_model' is not defined

In [31]:
# Load the model from the pickle file
loaded_model = joblib.load('sentiment_model.pkl')

# You can now use `loaded_model` to make predictions or evaluate it further
loaded_model_predictions = loaded_model.predict(X_test)

NameError: name 'X_test' is not defined

In [28]:
import joblib