In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk





imdb_data = pd.read_csv('IMDB-Dataset.csv')

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Apply preprocessing to reviews
imdb_data['cleaned_review'] = imdb_data['review'].apply(preprocess_text)

# Splitting the data into train and test sets
X = imdb_data['cleaned_review']
y = imdb_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Training a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Accuracy: 0.89

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


im=pd.read_csv('IMDB-Dataset.csv')

tokens=nltk.word_tokenize(im['review'][1000])
tokens[:100]

['This',
 'movie',
 'is',
 'awful',
 ',',
 'I',
 'ca',
 "n't",
 'even',
 'be',
 'bothered',
 'to',
 'write',
 'a',
 'review',
 'on',
 'this',
 'garbage',
 '!',
 'All',
 'i',
 'will',
 'say',
 'it',
 'is',
 'one',
 'of',
 'the',
 'most',
 'boring',
 'films',
 'I',
 "'ve",
 'ever',
 'seen.',
 '<',
 'br',
 '/',
 '>',
 '<',
 'br',
 '/',
 '>',
 'And',
 'the',
 'acting',
 'is',
 'very',
 'bad',
 '.',
 'The',
 'boy',
 'who',
 'plays',
 'the',
 'main',
 'character',
 'really',
 'annoys',
 'me',
 ',',
 'he',
 "'s",
 'got',
 'the',
 'same',
 'expression',
 'on',
 'his',
 'face',
 'through',
 'out',
 'the',
 'movie',
 '.',
 'I',
 'just',
 'want',
 'to',
 'slap',
 'him',
 '!',
 'Basically',
 '80',
 '%',
 'of',
 'the',
 'movie',
 'is',
 'slow',
 'motion',
 'shots',
 'of',
 'skateboarders',
 ',',
 'weird',
 'music',
 ',',
 'and',
 'utter']

In [4]:

nltk.pos_tag(tokens)

[('This', 'DT'),
 ('movie', 'NN'),
 ('is', 'VBZ'),
 ('awful', 'JJ'),
 (',', ','),
 ('I', 'PRP'),
 ('ca', 'MD'),
 ("n't", 'RB'),
 ('even', 'RB'),
 ('be', 'VB'),
 ('bothered', 'VBN'),
 ('to', 'TO'),
 ('write', 'VB'),
 ('a', 'DT'),
 ('review', 'NN'),
 ('on', 'IN'),
 ('this', 'DT'),
 ('garbage', 'NN'),
 ('!', '.'),
 ('All', 'DT'),
 ('i', 'NN'),
 ('will', 'MD'),
 ('say', 'VB'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('most', 'RBS'),
 ('boring', 'JJ'),
 ('films', 'NNS'),
 ('I', 'PRP'),
 ("'ve", 'VBP'),
 ('ever', 'RB'),
 ('seen.', 'VBN'),
 ('<', 'NNP'),
 ('br', 'NN'),
 ('/', 'NNP'),
 ('>', 'NNP'),
 ('<', 'NNP'),
 ('br', 'NN'),
 ('/', 'NNP'),
 ('>', 'NNP'),
 ('And', 'CC'),
 ('the', 'DT'),
 ('acting', 'NN'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('bad', 'JJ'),
 ('.', '.'),
 ('The', 'DT'),
 ('boy', 'NN'),
 ('who', 'WP'),
 ('plays', 'VBZ'),
 ('the', 'DT'),
 ('main', 'JJ'),
 ('character', 'NN'),
 ('really', 'RB'),
 ('annoys', 'VBZ'),
 ('me', 'PRP'),
 (',', ',')