In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.stem import PorterStemmer
import string
import joblib
import re
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

#nltk.download('punkt')
#nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/reyessou/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Assuming you have your dataset in a pandas DataFrame with two columns: 'reviews' and 'label'
df = pd.read_csv('IMDB Dataset.csv')  # Uncomment and load your dataset accordingly

In [3]:
df = df.rename(columns={'review': 'reviews'})

In [4]:
# Custom list of very frequent words (you can update this list based on your dataset)
custom_stop_words = set(['movie', 'film', 'character', 'plot'])  # Add domain-specific frequent words if needed
all_stop_words = list(ENGLISH_STOP_WORDS.union(custom_stop_words))

In [5]:
# Function to preprocess text: remove symbols, stop words, frequent words, etc.
def preprocess_text(text):
    # 1. Convert text to lowercase
    text = text.lower()
    
    # 2. Remove punctuation and special characters using regex
    text = re.sub(r'[^\w\s]', '', text)  # Only keep alphanumeric and spaces
    
    # 3. Tokenize the text into words
    tokens = word_tokenize(text)
    
    # 4. Remove stopwords (optional, if you want to keep stopwords, skip this step)
    #stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in all_stop_words]
    
    # 5. Apply stemming to each word
    #stemmed_tokens = [PorterStemmer.stem(token) for token in tokens]
    p_stemmer = PorterStemmer()

    # stem tokenized text and print first 500 tokens
    stemmed_tokens = [p_stemmer.stem(word) for word in tokens]
    
    return ' '.join(stemmed_tokens)

In [6]:
# Preprocess the reviews
df['clean_reviews'] = df['reviews'].apply(preprocess_text)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['clean_reviews'], df['sentiment'], test_size=0.2, random_state=42)

In [7]:
# Create a TfidfVectorizer to extract 1-grams and 2-grams, remove stop words, and limit frequent words
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=5)

In [8]:
# Define the  model pipeline
pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),  # Use TF-IDF instead of raw counts
    ('svc', LinearSVC()) # Use linear kernel for classification
])

In [9]:
# Define the hyperparameter grid for GridSearchCV
param_grid = {
    'svc__C': [0.5,1,5],  # Regularization parameter
    'svc__tol': [1e-2, 1e-3],  # Tolerance for stopping criteria
    'svc__max_iter': [500000, 1000000],  # Maximum iterations
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose = 2, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Extract the best pipeline, including the fitted TfidfVectorizer
best_pipeline = grid_search.best_estimator_

# Print the best parameters and the best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy Score:", grid_search.best_score_)

# Save the entire pipeline (including the TfidfVectorizer) to a file
#joblib.dump(best_pipeline, 'best_pipeline.pkl')

# Evaluate the best model on the test set
test_score = best_pipeline.score(X_test, y_test)
print("Test Set Accuracy:", test_score)

# Access the fitted TfidfVectorizer
fitted_tfidf = best_pipeline.named_steps['tfidf']

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Hyperparameters: {'svc__C': 1, 'svc__max_iter': 500000, 'svc__tol': 0.01}
Best Accuracy Score: 0.89645
Test Set Accuracy: 0.9037


In [10]:
Xnew = input('Enter your review:')

Enter your review: The movie was not different from many others, it did not leave an impression


In [13]:
# Preprocessing and tfidf-ing the new review
df1= pd.DataFrame.from_dict({'Newreview':[Xnew]})
FeaturesXnew = df1['Newreview'].apply(preprocess_text)
FeaturesXnewtfidf = fitted_tfidf.transform(FeaturesXnew)
fitted_svc = best_pipeline.named_steps['svc']
ynew=fitted_svc.predict(FeaturesXnewtfidf)

In [14]:
print("%s - This review is %s" % (Xnew, ynew[0]))

The movie was not different from many others, it did not leave an impression - This review is negative
[CV] END .......svc__C=1, svc__max_iter=100000, svc__tol=0.1; total time=  24.4s
[CV] END ......svc__C=1, svc__max_iter=500000, svc__tol=0.01; total time=  16.1s
[CV] END .......svc__C=5, svc__max_iter=100000, svc__tol=0.1; total time=  18.0s
[CV] END .......svc__C=5, svc__max_iter=500000, svc__tol=0.1; total time=  18.6s
[CV] END ......svc__C=10, svc__max_iter=100000, svc__tol=0.1; total time=  19.0s
[CV] END .......svc__C=1, svc__max_iter=100000, svc__tol=0.1; total time=  24.9s
[CV] END .......svc__C=1, svc__max_iter=500000, svc__tol=0.1; total time=  14.7s
[CV] END .......svc__C=5, svc__max_iter=100000, svc__tol=0.1; total time=  18.6s
[CV] END .......svc__C=5, svc__max_iter=500000, svc__tol=0.1; total time=  19.0s
[CV] END ......svc__C=10, svc__max_iter=100000, svc__tol=0.1; total time=  19.5s
[CV] END ......svc__C=1, svc__max_iter=100000, svc__tol=0.01; total time=  24.4s
[CV] E