###  Importing Necessary Libraries

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import string
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

### Initializing NLTK Resources and Preprocessing Tools

In [2]:
#download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

#initialize stemmer, lemmatizer, and spell checker
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/snehamondal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/snehamondal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snehamondal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/snehamondal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


###  Loading the Dataset

In [3]:
#load the dataset
df = pd.read_csv('/Users/siva/Downloads/all_comments.csv')

### Identifying and Correcting Misspelled Words

In [4]:
#create a list of all words in the dataset
all_words = []
df['Comment'].dropna().apply(lambda x: all_words.extend(word_tokenize(x.lower())))
all_words = [word for word in all_words if word.isalpha()]  # Filter out non-alphabetic

#identify the unique words and their frequencies
word_freq = Counter(all_words)

#identify misspelled words
misspelled_words = spell.unknown(word_freq.keys())

#create a dictionary of corrected spellings
corrected_spellings = {word: spell.correction(word) for word in misspelled_words}

### Replacing Misspelled Words in Comments

In [5]:
def replace_misspelled(text, corrections):
    if text is None:
        return text  #return None if input is None
    words = word_tokenize(text.lower())
    corrected_text = []
    for word in words:
        #check if the word is in corrections, and it's not None
        corrected_word = corrections.get(word)
        if corrected_word is None:
            corrected_word = word
        corrected_text.append(corrected_word)
    return " ".join(corrected_text)

#apply the function to the DataFrame
df['Corrected_Comment'] = df['Comment'].apply(lambda x: replace_misspelled(x, corrected_spellings) if pd.notnull(x) else x)

###  Preprocessing Text Data (Tokenization, Stemming, Lemmatization, Removing Stopwords)

In [6]:
def preprocess_text(text):
    #check if the text is None
    if text is None:
        return ''

    #tokenization and lowercasing
    tokens = word_tokenize(text.lower())

    #remove punctuation and non-alphabetic characters
    tokens = [t for t in tokens if t.isalpha()]

    #stemming and lemmatization
    stemmed = [stemmer.stem(t) for t in tokens if t is not None]
    lemmatized = [lemmatizer.lemmatize(t, pos='v') for t in stemmed if t is not None]

    #remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in lemmatized if not w in stop_words]

    return ' '.join(filtered_words)

#apply the preprocessing function to the DataFrame
df['Processed_Comment'] = df['Corrected_Comment'].apply(preprocess_text)

### Saving the Processed Data

In [7]:
#save the processed dataframe to a new CSV file
processed_csv_path = '/Users/siva/Downloads/processed_comments.csv'
df.to_csv(processed_csv_path, index=False)

print("Processing complete. The processed comments are saved at:", processed_csv_path)

Processing complete. The processed comments are saved at: /Users/snehamondal/Desktop/Data/processed_comments.csv


In [8]:
#load the processed comments from the CSV file
processed_comments_path = '/Users/siva/Downloads//processed_comments.csv'
df_processed = pd.read_csv(processed_comments_path)

#display the first few rows of the dataframe
print(df_processed.head())

       VideoID                                            Comment  \
0  bmR98lYurtc  Forest don’t play premier league football, the...   
1  bmR98lYurtc                              Hope bren&#39;s okay.   
2  bmR98lYurtc  The referee is such a weird guy that he needs ...   
3  bmR98lYurtc  Gotta be wary of offences &amp; çards............   
4  bmR98lYurtc                               Semangat untuk juara   

               Author             Timestamp  \
0    @JackSouth-gm6qr  2023-12-16T21:04:02Z   
1            @alae709  2023-12-16T21:03:18Z   
2        @junodoh6358  2023-12-16T20:54:08Z   
3  @williamwilkes9873  2023-12-16T20:25:49Z   
4     @joypradana9237  2023-12-16T19:38:27Z   

                                   Corrected_Comment  \
0  forest don ’ i play premier league football , ...   
1                        hope been & # 39 ; i okay .   
2  the referee is such a weird guy that he needs ...   
3  got ta be wary of offences & amp ; cards ........   
4                     

### Preparing Data for TF-IDF Vectorization

In [9]:
#load the preprocessed comments and handle any missing values
comments = df['Processed_Comment'].fillna('')  # Replace NaNs with empty strings

### Initializing and Applying TF-IDF Vectorizer

In [10]:
#initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

### Analyzing TF-IDF Results

In [11]:
#retrieve feature names (words) from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names()

#extract and analyze TF-IDF scores for the first document (comment)
first_document_vector = tfidf_matrix[0]

#convert the TF-IDF results for the first document into a readable DataFrame
df_tfidf = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

df_tfidf = df_tfidf.sort_values(by=["tfidf"], ascending=False)

print(df_tfidf.head(10))

            tfidf
rid      0.450850
cooper   0.428308
forest   0.409370
premier  0.319684
footbal  0.299802
leagu    0.281380
go       0.255152
get      0.235711
play     0.231919
ростов   0.000000


