In [31]:
#Import libraries
import pandas as pd
import numpy as np
import nltk

In [32]:
#import combined_cleaned.csv dataset, but drop index
df = pd.read_csv('../data/combined_cleaned.csv', index_col=0)


In [33]:
#Check for nan values in 'text' column
df['text'].isna().sum()

0

In [42]:
#Importing other nltk libraries
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import string


# Instantiate Tokenizer
tokenizer = RegexpTokenizer(r'\w+') 



lemmatizer = WordNetLemmatizer()

In [36]:
#Print columns of df
print(df.columns)

Index(['id', 'title', 'text', 'score', 'label'], dtype='object')


In [37]:
df['text'].head()

Unnamed: 0
0    ['many', 'people', 'experience', 'increased', ...
2    ['noticed', 'today', 'kept', 'randomly', 'laug...
3    ['manic', 'two', 'week', 'yesterday', 'fell', ...
4    ['school', 'teacher', 'love', 'class', 'couple...
5    ['anyone', 'feel', 'like', 'lone', 'forever', ...
Name: text, dtype: object

In [38]:
#print number of rows df
print(len(df))

3514


## Vectorizing

In [43]:
def preprocess_text(text):
    # Check if the input is a string
    if isinstance(text, str):
        # Convert text to lowercase and remove punctuation
        text = "".join([word.lower() for word in text if word not in string.punctuation])
        # Tokenize text
        tokens = re.split('\W+', text)
        # Remove stopwords and lemmatize words
        text = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]

    return text

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer and pass in the clean_text function as the analyzer parameter
count_vect = CountVectorizer(analyzer=preprocess_text)

# X_counts stores the vectorized version of the data
X_counts = count_vect.fit_transform(df['text'])

# Returns the number of rows and columns of X_counts i.e. 5567 text messages and 8104 unique words
print(X_counts.shape)

# Returns the unique words across alll text messages
print(count_vect.get_feature_names_out())

(3514, 17221)
['' 'aa' 'aaam' ... '아래' '이믈을' '퉁퉁해수퍼맨']


In [47]:
#export to csv 
df.to_csv('../data/combined_cleaned_vectorized.csv')

In [46]:
# Convert the matrix to a DataFrame
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names_out())

# Get the most common words
top_words = X_counts_df.sum().sort_values(ascending=False).head(10)
print('Top words:')
print(top_words)


Top words:
im        5217
like      3764
feel      2417
dont      2371
know      2018
get       1963
time      1909
ive       1895
people    1731
thing     1607
dtype: int64
