In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
train_df = pd.read_csv('train.tsv.zip', sep='\t')
test_df = pd.read_csv('test.tsv.zip', sep='\t')

print("First few records of the training data:")
train_df.head()

First few records of the training data:


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
print("First few records of the test data:")
test_df.head()


First few records of the test data:


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [4]:
print("Null values in training data:", train_df.isnull().sum())
print("Null values in test data:", test_df.isnull().sum())

Null values in training data: PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64
Null values in test data: PhraseId      0
SentenceId    0
Phrase        1
dtype: int64


In [5]:
print("Data types in training data:", train_df.dtypes)
print("Data types in test data:", test_df.dtypes)


Data types in training data: PhraseId       int64
SentenceId     int64
Phrase        object
Sentiment      int64
dtype: object
Data types in test data: PhraseId       int64
SentenceId     int64
Phrase        object
dtype: object


In [6]:
vectorizer = CountVectorizer(stop_words='english', min_df=2)
X_train_counts = vectorizer.fit_transform(train_df['Phrase'])
X_train, X_val, y_train, y_val = train_test_split(X_train_counts, train_df['Sentiment'], test_size=0.2, random_state=42)


## Build the Logistic Regression Model

In [7]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)
y_val_pred = log_reg.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Model Accuracy on Validation Set: {accuracy}")

Model Accuracy on Validation Set: 0.6426694860950917


## Hyperparameter tuning

In [8]:
param_grid = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print(f"Best Parameters from Grid Search: {best_params}")

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/danmarino/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/danmarino/Library/Python/3.9/lib/pyt

Best Parameters from Grid Search: {'C': 1, 'penalty': 'l2'}


## Evaluate the model on the test set

In [9]:
log_reg_optimized = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], max_iter=1000, random_state=42)
log_reg_optimized.fit(X_train, y_train)
y_val_pred_optimized = log_reg_optimized.predict(X_val)
accuracy_optimized = accuracy_score(y_val, y_val_pred_optimized)
print(f"Optimized Model Accuracy on Validation Set: {accuracy_optimized}")

Optimized Model Accuracy on Validation Set: 0.6426694860950917


In [10]:
if accuracy_optimized == accuracy:
    print("We get the same results from the optimized model as we do for our original model. Other methods could be inplemented including using BERT for increased nuance detection")
elif accuracy_optimized > accuracy:
    print("We get better results from the optimized model than we do for our original model. Other methods could be inplemented including using BERT for increased nuance detection")
else:
    print("We get worse results from the optimized model than we do for our original model. Other methods could be inplemented including using BERT for increased nuance detection")

We get the same results from the optimized model as we do for our original model. Other methods could be inplemented including using BERT for increased nuance detection


In [11]:
# Make a submission file
X_test_counts = vectorizer.transform(test_df['Phrase'])
y_test_pred = log_reg_optimized.predict(X_test_counts)
submission_df = pd.DataFrame({'PhraseId': test_df['PhraseId'], 'Sentiment': y_test_pred})
submission_df.to_csv('submission.csv', index=False)

ValueError: np.nan is an invalid document, expected byte or unicode string.