In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [2]:
df = pd.read_csv(r'D:\DATA SCIENCE\youtube_sentiment_analysis\dataset\reddit.csv', encoding='utf-8')


In [3]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [5]:
df.isnull().sum()

clean_comment    100
category           0
dtype: int64

In [6]:
df[df['clean_comment'].isna()]['category'].value_counts()

category
0    100
Name: count, dtype: int64

In [7]:
df = df.dropna()

In [8]:
df.duplicated().sum()

np.int64(350)

In [9]:
df.drop_duplicates(inplace=True)

In [11]:
df[(df['clean_comment'].str.strip() == '')]

Unnamed: 0,clean_comment,category
181,,0
4432,\n,0
10592,,0
16173,,0
32149,\n,0
34959,,0


In [12]:
df = df[~(df['clean_comment'].str.strip() == '')]

In [13]:
# Convert the 'clean_comment' column to lowercase
df['clean_comment'] = df['clean_comment'].str.lower()

# Verify the transformation by displaying the first few rows
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_comment'] = df['clean_comment'].str.lower()


Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [14]:
df[df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' '))]

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37241,let the janta decide not ulema clerics,0
37242,hona hai same with vaccination education insu...,0
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [15]:
def preprocess_comments(df, column='clean_comment'):
    """
    Preprocess the comments column by:
    - Dropping NaNs and duplicates
    - Removing empty or whitespace-only strings
    - Converting to lowercase
    - Stripping leading/trailing whitespace
    - Removing newline characters
    
    Returns:
    - Cleaned DataFrame
    """
    df = df.dropna().drop_duplicates()
    df = df[~(df[column].str.strip() == '')]
    df[column] = df[column].str.lower()
    df[column] = df[column].str.strip()
    df[column] = df[column].str.replace('\n', ' ', regex=True)
    return df


In [16]:
df = preprocess_comments(df, column='clean_comment')

In [17]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them th...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [18]:
df.isnull().sum()

clean_comment    0
category         0
dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X = df['clean_comment']
y = df['category']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
X_train.shape

(29434,)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
tfidf = TfidfVectorizer(max_features=5000)
# Step 3: Fit on training data only
X_train_tfidf = tfidf.fit_transform(X_train)

# Step 4: Transform test data using same vectorizer
X_test_tfidf = tfidf.transform(X_test)

In [32]:
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 571002 stored elements and shape (29434, 5000)>

In [33]:

tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print(tfidf_df.head())

   000  100  1000  101  10th  120  150  180ml  1947  1984  ...  yourself  \
0  0.0  0.0   0.0  0.0   0.0  0.0  0.0    0.0   0.0   0.0  ...       0.0   
1  0.0  0.0   0.0  0.0   0.0  0.0  0.0    0.0   0.0   0.0  ...       0.0   
2  0.0  0.0   0.0  0.0   0.0  0.0  0.0    0.0   0.0   0.0  ...       0.0   
3  0.0  0.0   0.0  0.0   0.0  0.0  0.0    0.0   0.0   0.0  ...       0.0   
4  0.0  0.0   0.0  0.0   0.0  0.0  0.0    0.0   0.0   0.0  ...       0.0   

   youth  youtube  yra  yrs  zee  zen  zero  zindabad  zone  
0    0.0      0.0  0.0  0.0  0.0  0.0   0.0       0.0   0.0  
1    0.0      0.0  0.0  0.0  0.0  0.0   0.0       0.0   0.0  
2    0.0      0.0  0.0  0.0  0.0  0.0   0.0       0.0   0.0  
3    0.0      0.0  0.0  0.0  0.0  0.0   0.0       0.0   0.0  
4    0.0      0.0  0.0  0.0  0.0  0.0   0.0       0.0   0.0  

[5 rows x 5000 columns]


### word to vec 