In [2]:
# Sentiment Analysis on Movie Reviews

## Import Required Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Step 1: Load and Explore the Data

# Replace 'train_path' and 'test_path' with your actual file paths for the training and test datasets
train_df = pd.read_csv('train.tsv.zip', sep='\t')
test_df = pd.read_csv('test.tsv.zip', sep='\t')

# Show the first few records of the training data
print("First few records of the training data:")
print(train_df.head())

# Show the first few records of the test data
print("First few records of the test data:")
print(test_df.head())

## Step 2: Handle Null Values

# Check for null values in the training and test data
print("Null values in training data:", train_df.isnull().sum())
print("Null values in test data:", test_df.isnull().sum())

# Since there are no null values, we don't need to handle them in this case.

## Step 3: Inspect Data Types

# Check the data types of the columns in training and test data
print("Data types in training data:", train_df.dtypes)
print("Data types in test data:", test_df.dtypes)

## Step 4: Text Preprocessing

# For simplicity, we'll move forward without additional text preprocessing steps like stemming or lemmatization.

## Step 5: Feature Extraction

# Initialize CountVectorizer to convert text into bag-of-words representation
vectorizer = CountVectorizer(stop_words='english', min_df=2)

# Fit and transform the training data
X_train_counts = vectorizer.fit_transform(train_df['Phrase'])

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_counts, train_df['Sentiment'], test_size=0.2, random_state=42)

## Step 6: Build the Logistic Regression Model

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = log_reg.predict(X_val)

# Evaluate the model's performance
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Model Accuracy on Validation Set: {accuracy}")

## Step 7: Hyperparameter Tuning

# Define the hyperparameters and their possible values
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=3)

# Perform Grid Search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from Grid Search
best_params = grid_search.best_params_
print(f"Best Parameters from Grid Search: {best_params}")

## Step 8: Evaluate the Optimized Model

# Initialize the Logistic Regression model with the best hyperparameters
log_reg_optimized = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], max_iter=1000, random_state=42)

# Fit the optimized model on the training data
log_reg_optimized.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred_optimized = log_reg_optimized.predict(X_val)

# Evaluate the optimized model's performance
accuracy_optimized = accuracy_score(y_val, y_val_pred_optimized)
print(f"Optimized Model Accuracy on Validation Set: {accuracy_optimized}")


First few records of the training data:
   PhraseId  SentenceId                                             Phrase   
0         1           1  A series of escapades demonstrating the adage ...  \
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
First few records of the test data:
   PhraseId  SentenceId                                             Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routi

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/danmarino/Library/Python/3.9/lib/pyt

Best Parameters from Grid Search: {'C': 1, 'penalty': 'l2'}
Optimized Model Accuracy on Validation Set: 0.6426694860950917
