In [27]:
import pandas as pd
import re  # regex
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import  TfidfVectorizer

In [28]:
# previewing iniail values
df = pd.read_csv("imdb_master.csv", sep=",", encoding="ISO-8859-1")
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [29]:
# checking initail values
print(df['type'].unique())
print(df['label'].unique())

['test' 'train']
['neg' 'pos' 'unsup']


In [30]:
df = df.drop(df.columns[0], axis=1)
df = df.drop(df.columns[3], axis=1)

df

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup
99996,train,"Watching Time Chasers, it obvious that it was ...",unsup
99997,train,At the beginning we can see members of Troma t...,unsup
99998,train,"The movie was incredible, ever since I saw it ...",unsup


In [31]:
nltk.download("stopwords")
stop_words = stopwords.words("english")
CLEANR = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
nltk.download("punkt")
ps = PorterStemmer()
nltk.download("wordnet")
nltk.download("omw-1.4")

wnl = WordNetLemmatizer()


def remove_stop_words(sentence):
    # Split the sentence into individual words
    words = sentence.split()
    # Use a list comprehension to remove stop words
    filtered_words = [word for word in words if word not in stop_words]
    # Join the filtered words back into a sentence
    return " ".join(filtered_words)


def remove_html_tags(sentence):
    return re.sub(CLEANR, "", sentence)


def remove_punctuation(sentence):
    return sentence.translate(str.maketrans("", "", string.punctuation))


def remove_none_words(sentence):
    sentence = re.sub(r"[0-9]+", "", sentence)
    return re.sub(r"\W+", " ", sentence)


def apply_stemmer(sentence):
    return ps.stem(sentence)


def apply_lemmatization(sentence):
    return wnl.lemmatize(sentence)


def get_words_tokens(sentence):
    return word_tokenize(sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [32]:
# rows data clean
review_token = []
for index, row in df.iterrows():
    row["review"] = remove_html_tags(row["review"])
    row["review"] = remove_punctuation(row["review"])
    row["review"] = remove_stop_words(row["review"])
    row["review"] = remove_none_words(row["review"])
    row["review"] = row["review"].lower()
    row["review"] = apply_stemmer(row["review"])
    row["review"] = apply_lemmatization(row["review"])
    review_token.append(get_words_tokens(row["review"]))

df["review_token"] = review_token
df.head()

Unnamed: 0,type,review,label,review_token
0,test,once mr costner dragged movie far longer neces...,neg,"[once, mr, costner, dragged, movie, far, longe..."
1,test,this example majority action films generic bor...,neg,"[this, example, majority, action, films, gener..."
2,test,first i hate moronic rappers couldnt act gun p...,neg,"[first, i, hate, moronic, rappers, couldnt, ac..."
3,test,not even beatles could write songs everyone li...,neg,"[not, even, beatles, could, write, songs, ever..."
4,test,brass pictures movies fitting word really some...,neg,"[brass, pictures, movies, fitting, word, reall..."


In [33]:
tfidf = TfidfVectorizer(norm=None, smooth_idf=False)
train_data = df.query('type == "train"').head()["review"]
x_tfidf = tfidf.fit_transform(train_data)

vecor = pd.DataFrame(x_tfidf.toarray(), columns=tfidf.get_feature_names_out())
vecor

Unnamed: 0,ability,absence,absurd,accomplices,across,acting,action,actors,actress,actually,...,worker,worried,worse,worst,would,wow,wrought,youre,youthful,zsigmond
0,0.0,0.0,5.218876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.916291,0.0,0.0,0.0,0.0,2.609438
1,0.0,0.0,0.0,2.609438,0.0,2.609438,2.609438,0.0,0.0,2.609438,...,2.609438,2.609438,2.609438,0.0,1.916291,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,2.609438,0.0,0.0,5.218876,2.609438,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.609438,0.0
3,0.0,2.609438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.609438,0.0,0.0,0.0,0.0
4,2.609438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.609438,0.0,0.0,2.609438,2.609438,0.0,0.0
