<a href="https://colab.research.google.com/github/dolmarawat/NLP-332-Final-Project/blob/main/NLPFinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Installing NLP Libraries

In [6]:
!pip install nltk spacy emoji
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m157.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


##Mounting Google Drive

In [9]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
import glob
import os
import pandas as pd

# Ensure folder_path is defined. It was defined in a previous cell.
# If this cell is run independently, ensure folder_path is set.
folder_path = "/content/drive/MyDrive/IST332NLPFinalProjectData"

print(f"Checking for CSV files in: {folder_path}")

csv_files = glob.glob(os.path.join(folder_path, "*.csv"))

print(f"Found CSV files: {csv_files}")

dfs = []

if not csv_files:
    print("Warning: No CSV files were found. Please verify the 'folder_path' and your Google Drive connection.")
    combined_df = pd.DataFrame()
else:
    for file in csv_files:
        df = pd.read_csv(file)

        # add filename as a source label
        df["source"] = os.path.basename(file)

        # drop empty rows
        if "text" in df.columns:
            df = df[df["text"].notna()]
            df = df[df["text"].str.strip() != ""]

        dfs.append(df)

    if not dfs:
        print("Warning: All found CSV files resulted in empty DataFrames after processing.")
        combined_df = pd.DataFrame()
    else:
        combined_df = pd.concat(dfs, ignore_index=True)
        print("Total combined comments:", len(combined_df))
        print(combined_df.head())


Checking for CSV files in: /content/drive/MyDrive/IST332NLPFinalProjectData
Found CSV files: ['/content/drive/MyDrive/IST332NLPFinalProjectData/comments_whats so bad about lab grown diamonds.csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/comments_justification of lab over natural diamonds (lab leaning).csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/lab created diamonds are also diamonds (lab leaning).csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/do women generally care about either?.csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/have attitudes toward lab grown diamonds changed recently? .csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/Diamondscomments1_data .csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/Diamondscomments2_data.csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/Diamondscomments3_data.csv', '/content/drive/MyDrive/IST332NLPFinalProjectData/Lab-Natural_Debate.csv', '/content/drive/MyDrive/IST332NLPFinalProjectData

In [11]:
display(combined_df.head())

Unnamed: 0,username,comment,source,comments,comment_id,score,timestamp,post_title,post_url,author,...,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,text,points,time_posted
0,lkvwfurry,Literally nothing except that Big Diamond woul...,comments_whats so bad about lab grown diamonds...,,,,,,,,...,,,,,,,,,,
1,Noize42,Sounds like a pretty big plus. This is how we ...,comments_whats so bad about lab grown diamonds...,,,,,,,,...,,,,,,,,,,
2,Fartimer,As a millennial I'm not going to buy another d...,comments_whats so bad about lab grown diamonds...,,,,,,,,...,,,,,,,,,,
3,seansand,"""Big Diamond"" is literally just one company, D...",comments_whats so bad about lab grown diamonds...,,,,,,,,...,,,,,,,,,,
4,JConRed,"Interesting. Is it weird that, while I've neve...",comments_whats so bad about lab grown diamonds...,,,,,,,,...,,,,,,,,,,


#Data Preprocessing and Cleaning

### Downloading Preprocessing libraries

In [16]:

import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab') # Added to resolve the LookupError

# Initialize the Lemmatizer and Stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Cleans text by:
    1. Lowercasing
    2. Removing URLs and links
    3. Removing punctuation
    4. Removing stopwords (common words like 'the', 'is')
    5. Lemmatizing (converting words to their base form, e.g., 'buying' -> 'buy')
    """
    # 1. Convert to lower case
    text = text.lower()

    # 2. Remove URLs (common in Reddit data)
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 3. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 4. Tokenize (split into words)
    tokens = word_tokenize(text)

    # 5. Remove stopwords and Lemmatize
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and word.isalnum()
    ]

    # Join back into a single string
    return " ".join(cleaned_tokens)

# Apply the function to your DataFrame
print("Preprocessing data... this may take a moment.")
combined_df['cleaned_text'] = combined_df['text'].apply(lambda x: preprocess_text(str(x)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing data... this may take a moment.


##Data Exploration

In [18]:

# Display the Before v
print("\n--- Data Cleaning Sample ---")
print(combined_df[['text', 'cleaned_text']].head())

# Calculate simple statistics for the update (Required by Task 2 Rubric)
total_comments = len(combined_df)
avg_length_original = combined_df['text'].apply(lambda x: len(str(x).split())).mean()
avg_length_clean = combined_df['cleaned_text'].apply(lambda x: len(str(x).split())).mean()

print("\n--- Project Progress Statistics ---")
print(f"Total Records Collected: {total_comments}")
print(f"Avg Word Count (Original): {avg_length_original:.2f}")
print(f"Avg Word Count (Cleaned):  {avg_length_clean:.2f}")


--- Data Cleaning Sample ---
  text                                       cleaned_text
0  NaN  literally nothing except big diamond would los...
1  NaN  sound like pretty big plus millenials killing ...
2  NaN  millennial im going buy another diamond id lik...
3  NaN          big diamond literally one company de beer
4  NaN  interesting weird ive never inclined get diamo...

--- Project Progress Statistics ---
Total Records Collected: 8879
Avg Word Count (Original): 1.94
Avg Word Count (Cleaned):  14.58


In [20]:
from nltk.probability import FreqDist

# Combine all cleaned text into a single string, then tokenize
all_words = ' '.join(combined_df['cleaned_text']).split()

# Calculate frequency distribution
fdist = FreqDist(all_words)

# Get the 10 most common words
most_common_words = fdist.most_common(25)

print("\n--- 10 Most Frequent Words in Cleaned Text ---")
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")



--- 10 Most Frequent Words in Cleaned Text ---
diamond: 4573
nan: 3766
lab: 2552
ring: 1960
like: 1276
natural: 1110
would: 1028
one: 1016
get: 969
grown: 953
want: 817
people: 785
stone: 775
think: 743
mined: 680
know: 641
make: 616
thing: 580
love: 574
also: 566
way: 539
much: 536
even: 535
really: 497
nta: 494
