## Sentiment Analysis for Reddit Data

In [1]:
import praw
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import pandas as pd
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\devuj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# 1.	Data Collection

In [3]:
reddit = praw.Reddit(client_id='_DImiptAS4SzysaUp5ZhUQ',
                     client_secret='D4PNJf-0_ddWTJtmL9egt-Gie48pzQ',
                     user_agent='Responsible-Art5268')


In [4]:
headlines = set()

### Get the headlines

In [5]:
# Different sorts and time filters
sort_methods = ['new', 'hot', 'top', 'controversial']
time_filters = ['day', 'week', 'month', 'year', 'all']

# List of related subreddits
related_subreddits = ['economy', 'finance', 'business', 'investing']

# Function to fetch headlines from a given subreddit
def fetch_headlines_from_subreddit(subreddit_name):
    subreddit = reddit.subreddit(subreddit_name)
    for sort_method in sort_methods:
        if sort_method in ['top', 'controversial']:
            for time_filter in time_filters:
                fetch_headlines(subreddit, sort_method, time_filter)
                if len(headlines) >= 10000:
                    return
        else:
            fetch_headlines(subreddit, sort_method)
            if len(headlines) >= 10000:
                return

# Function to fetch headlines using a specific sort and time filter
def fetch_headlines(subreddit, sort_method, time_filter=None):
    if time_filter:
        submissions = getattr(subreddit, sort_method)(time_filter=time_filter, limit=None)
    else:
        submissions = getattr(subreddit, sort_method)(limit=None)

    for submission in submissions:
        headlines.add(submission.title)
        if len(headlines) >= 10000:
            break

# Iterate and collect headlines from each subreddit
for subreddit_name in related_subreddits:
    fetch_headlines_from_subreddit(subreddit_name)
    if len(headlines) >= 10000:
        break

print(f"Total headlines collected: {len(headlines)}")

Total headlines collected: 10000


### Converting to lowercase

In [6]:
#Converting to lowercase
lowercaseHeadlines = set()
for item in headlines:
    lowercaseHeadlines.add(item.lower())

### Removing the special characters

In [7]:
import re
import pandas as pd

# Function to remove special characters
def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Clean the headlines and create a new set
cleaned_headlines = {remove_special_characters(item) for item in lowercaseHeadlines}

# Counting the number of headlines with special characters
count_special = sum(bool(re.search(r'[^a-zA-Z0-9\s]', item)) for item in lowercaseHeadlines)

print(f"Number of items with special characters: {count_special}")

Number of items with special characters: 8060


### Annotate the dataset

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

annotated_headlines = []
for headline in cleaned_headlines:
    score = sia.polarity_scores(headline)
    compound = score['compound']
    
    if compound >= 0.05:
        sentiment = 'positive'
    elif compound <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    annotated_headlines.append((headline, sentiment))

# Example: print first few annotated headlines
print(annotated_headlines[:5])

[('', 'neutral'), ('warren buffett is now richer than mark zuckerberg after tech titan lost 31 billion following metas stock crash', 'negative'), ('car loans could be the next subprime crisis thanks republicans', 'negative'), ('how bonds ate the entire financial system  a very short very wild history of the market that will shape the next financial crisis', 'negative'), ('china produces more automobiles than the us japan south korea and germany combined', 'neutral')]


### Convert to dataframe

In [9]:
headline_df = pd.DataFrame.from_records(annotated_headlines, columns=['Headline', 'Label'])
headline_df.head()

Unnamed: 0,Headline,Label
0,,neutral
1,warren buffett is now richer than mark zuckerb...,negative
2,car loans could be the next subprime crisis th...,negative
3,how bonds ate the entire financial system a v...,negative
4,china produces more automobiles than the us ja...,neutral


### Save to csv

In [10]:
# Save the DataFrame to a CSV file
csv_file_path = 'annotated_headlines.csv'  
headline_df.to_csv(csv_file_path, index=False)

### Remove any null headlines present after the preprocessing

In [11]:
finalHeadline_df = headline_df[headline_df['Headline'].notna() & headline_df['Headline'].str.strip().astype(bool)]
finalHeadline_df.head()

Unnamed: 0,Headline,Label
1,warren buffett is now richer than mark zuckerb...,negative
2,car loans could be the next subprime crisis th...,negative
3,how bonds ate the entire financial system a v...,negative
4,china produces more automobiles than the us ja...,neutral
5,biden defends his handling of the economy amid...,neutral


In [12]:
headline_df.shape

(9987, 2)

# 2.	Preprocessing 

In [26]:
def preprocess_text(text):
    # Tokenizing the words
    tokens = word_tokenize(text)

    # Removing non-alpha characters
    tokens = [word for word in tokens if word.isalpha()] 

    # Removing stopwords
    stop_words = set(stopwords.words('english'))    
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming and Lemmatizing the words
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens] 

    return ' '.join(tokens)

# Apply the preprocess_text function using .loc
finalHeadline_df.loc[:, 'Processed_Headline'] = finalHeadline_df['Headline'].apply(preprocess_text)
finalHeadline_df.head()

Unnamed: 0,Headline,Label,Processed_Headline
1,warren buffett is now richer than mark zuckerb...,negative,warren buffett richer mark zuckerberg tech tit...
2,car loans could be the next subprime crisis th...,negative,car loan could next subprim crisi thank republ...
3,how bonds ate the entire financial system a v...,negative,bond ate entir financi system short wild histo...
4,china produces more automobiles than the us ja...,neutral,china produc automobil u japan south korea ger...
5,biden defends his handling of the economy amid...,neutral,biden defend handl economi amid latest rough i...


# 3.	Feature Extraction 

### TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(finalHeadline_df['Processed_Headline'])

In [28]:
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(tfidf_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)

scaler_tfidf = StandardScaler()
X_train_tfidf_scaled = scaler_tfidf.fit_transform(X_train_tfidf.toarray()) 
X_test_tfidf_scaled = scaler_tfidf.transform(X_test_tfidf.toarray())  
classifier_tfidf = LogisticRegression(max_iter=1000)
classifier_tfidf.fit(X_train_tfidf_scaled, y_train_tfidf)

# Make predictions
y_pred_tfidf = classifier_tfidf.predict(X_test_tfidf_scaled)

# Evaluate the classifier
print("TF-IDF Performance (Scaled):")
print(classification_report(y_test_tfidf, y_pred_tfidf))

TF-IDF Performance (Scaled):
              precision    recall  f1-score   support

    negative       0.68      0.67      0.68       700
     neutral       0.63      0.70      0.66       658
    positive       0.69      0.63      0.66       640

    accuracy                           0.67      1998
   macro avg       0.67      0.67      0.67      1998
weighted avg       0.67      0.67      0.67      1998



### Word2Vec

In [29]:
import gensim

# Tokenized documents
tokenized_docs = [doc.split() for doc in finalHeadline_df['Processed_Headline']]

# Train a Word2Vec model
word2vec_model = gensim.models.Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

# Function to create document vectors
def document_vector_word2vec(doc):
    words = doc.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector if no words are found
    else:
        return np.mean(word_vectors, axis=0)

word2vec_features = np.array([document_vector_word2vec(doc) for doc in finalHeadline_df['Processed_Headline']])


In [30]:
# Word2Vec
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(word2vec_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)
classifier_word2vec = LogisticRegression()
classifier_word2vec.fit(X_train_word2vec, y_train_word2vec)
y_pred_word2vec = classifier_word2vec.predict(X_test_word2vec)

# Evaluate the classifier
print("Word2Vec Performance:")
print(classification_report(y_test_word2vec, y_pred_word2vec))

Word2Vec Performance:
              precision    recall  f1-score   support

    negative       0.41      0.66      0.50       700
     neutral       0.41      0.50      0.45       658
    positive       0.49      0.05      0.09       640

    accuracy                           0.41      1998
   macro avg       0.43      0.40      0.35      1998
weighted avg       0.43      0.41      0.35      1998



### GloVe

In [31]:
import numpy as np
import gensim.downloader as api

# Load pre-trained GloVe model (choose an appropriate model)
glove_model = api.load("glove-wiki-gigaword-100")  # For example

# Function to create document vectors
def document_vector_glove(doc):
    words = doc.split()
    word_vectors = [glove_model[word] for word in words if word in glove_model]
    if len(word_vectors) == 0:
        return np.zeros(glove_model.vector_size)  # Return a zero vector if no words are found
    else:
        return np.mean(word_vectors, axis=0)

glove_features = np.array([document_vector_glove(doc) for doc in finalHeadline_df['Processed_Headline']])

In [32]:
# GloVe
X_train_glove, X_test_glove, y_train_glove, y_test_glove = train_test_split(glove_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_glove_scaled = scaler.fit_transform(X_train_glove)
X_test_glove_scaled = scaler.transform(X_test_glove)

classifier_glove = LogisticRegression(max_iter=1000)
classifier_glove.fit(X_train_glove_scaled, y_train_glove)
y_pred_glove = classifier_glove.predict(X_test_glove)

# Evaluate the classifier
print("GloVe Performance:")
print(classification_report(y_test_glove, y_pred_glove))

GloVe Performance:
              precision    recall  f1-score   support

    negative       0.55      0.76      0.64       700
     neutral       0.64      0.29      0.40       658
    positive       0.54      0.62      0.57       640

    accuracy                           0.56      1998
   macro avg       0.58      0.56      0.54      1998
weighted avg       0.58      0.56      0.54      1998



### Evaluating which Feature Extraction performs better among the three 
1. **TF-IDF Performance:**
    - Accuracy: 67% 
    - Precision, Recall, F1-Score: The three sentiment classifications are very equally distributed.
    - Observation: TF-IDF appears to function fairly well, indicating that the frequency and distinctiveness of the context in the headlines serve as reliable markers of mood. Recall and precision seem to be fairly balanced with this strategy.
    <br><br>

2. **Word2Vec Performance:**
    - Accuracy: 43%
    - Precision, Recall, and F1-Score: Much worse results, particularly with the positive class that has essentially no recall.
    - Observation: The low performance could be attributed to the Word2Vec model's inability to adequately capture semantic relationships in the data, or to the peculiar vocabulary prevalent in Reddit headlines that isn't adequately represented in the Word2Vec model's training corpus.
    <br><br>
3. **GloVe Performance:**
    - Accuracy: 55%
    - Precision, Recall, and F1-Score: Not as good as TF-IDF, but still superior to Word2Vec. In contrast, GloVe has a poorer recall but a comparatively higher precision.
    - Observation: GloVe's performance indicates that although it is superior to Word2Vec in capturing semantic associations, it is not as good as TF-IDF in providing context-specific understanding for this specific dataset.

### Insights about the dataset based on the feature extraction techniques:
- **Best Overall Method:** In terms of overall accuracy and balance between precision, recall, and F1-score, TF-IDF performs better than GloVe and Word2Vec. This suggests that, for the data collected, Word2Vec or GloVe's semantic word associations are not as predictive of sentiment as TF-IDF's capture of the value of individual terms.
- **Dataset Specificity:** Word2Vec and GloVe's performance indicates that the collected data may contain unique terminology or certain contextual nuances that these algorithms are not completely capturing. This may occur in datasets that contain slang, specialised terminology, or inventive language use.

In [33]:
finalHeadline_df

Unnamed: 0,Headline,Label,Processed_Headline
1,warren buffett is now richer than mark zuckerb...,negative,warren buffett richer mark zuckerberg tech tit...
2,car loans could be the next subprime crisis th...,negative,car loan could next subprim crisi thank republ...
3,how bonds ate the entire financial system a v...,negative,bond ate entir financi system short wild histo...
4,china produces more automobiles than the us ja...,neutral,china produc automobil u japan south korea ger...
5,biden defends his handling of the economy amid...,neutral,biden defend handl economi amid latest rough i...
...,...,...,...
9982,deutsche bank ceo warns recession is inevitabl...,negative,deutsch bank ceo warn recess inevit say german...
9983,president biden to join uaw picket line and fi...,positive,presid biden join uaw picket line fix u economi
9984,americans are fed up with billionaires washing...,positive,american fed billionair washington need get we...
9985,where to start,neutral,start


# 4.	Model Selection and Training 

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, finalHeadline_df['Label'], test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = nb_classifier.predict(X_test)
print("Naive Bayes Classifier Performance:")
print(classification_report(y_test, y_pred))


Naive Bayes Classifier Performance:
              precision    recall  f1-score   support

    negative       0.67      0.84      0.75       700
     neutral       0.75      0.65      0.69       658
    positive       0.75      0.65      0.70       640

    accuracy                           0.71      1998
   macro avg       0.72      0.71      0.71      1998
weighted avg       0.72      0.71      0.71      1998



In [35]:
from sklearn.svm import SVC

# Initialize and train the SVM Classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred_svm = svm_classifier.predict(X_test)
print("SVM Classifier Performance:")
print(classification_report(y_test, y_pred_svm))


SVM Classifier Performance:
              precision    recall  f1-score   support

    negative       0.80      0.74      0.77       700
     neutral       0.71      0.85      0.77       658
    positive       0.81      0.71      0.75       640

    accuracy                           0.77      1998
   macro avg       0.77      0.77      0.76      1998
weighted avg       0.77      0.77      0.76      1998



In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Convert labels to categorical
y_train_cat = to_categorical(y_train.factorize()[0])
y_test_cat = to_categorical(y_test.factorize()[0])

# Neural Network Model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(256, activation='relu'))
model.add(Dense(y_train_cat.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train.toarray(), y_train_cat, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test_cat)
print("Neural Network Performance: Accuracy = {:.2f}".format(accuracy))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Performance: Accuracy = 0.13


In [37]:
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test_cat, axis=1)

report = classification_report(y_true, y_pred_classes)
print(report)

              precision    recall  f1-score   support

           0       0.12      0.11      0.11       658
           1       0.13      0.13      0.13       640
           2       0.15      0.15      0.15       700

    accuracy                           0.13      1998
   macro avg       0.13      0.13      0.13      1998
weighted avg       0.13      0.13      0.13      1998



# 5.	Deployment and Interface

In [39]:
import tkinter as tk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import numpy as np

svm_classifier.classes_ = np.array(['negative', 'neutral', 'positive'])

def predict_sentiment(text):
    processed_input = preprocess_text(text) 
    vectorized_input = tfidf_vectorizer.transform([processed_input])
    prediction = svm_classifier.predict(vectorized_input)
    return prediction[0]

test_texts = [
    "I love this new phone, it has such great features!",
    "This is the worst movie I have ever seen.",
    "I'm not really sure how I feel about this new policy.",
    "What a terrible way to handle the situation!",
    "Absolutely adore the new restaurant in town!"
]

# Using the function to predict sentiment for each test text
predicted_sentiments = {text: predict_sentiment(text) for text in test_texts}

# Printing the predicted sentiments for each test text
for text, sentiment in predicted_sentiments.items():
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

Text: I love this new phone, it has such great features!
Predicted Sentiment: positive

Text: This is the worst movie I have ever seen.
Predicted Sentiment: negative

Text: I'm not really sure how I feel about this new policy.
Predicted Sentiment: positive

Text: What a terrible way to handle the situation!
Predicted Sentiment: negative

Text: Absolutely adore the new restaurant in town!
Predicted Sentiment: neutral



In [None]:
import tkinter as tk
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Function to preprocess and predict sentiment
def predict_sentiment():
    user_input = text_input.get("1.0", "end-1c")  
    processed_input = preprocess_text(user_input)  
    vectorized_input = tfidf_vectorizer.transform([processed_input])  
    prediction = svm_classifier.predict(vectorized_input)  
    result_label.config(text="Predicted Sentiment: " + str(prediction[0]))  

# Tkinter window
root = tk.Tk()
root.title("Social Media Sentiment Analysis")

# Text input widget
text_input = tk.Text(root, height=5, width=40)
text_input.pack()

# Predict button
predict_button = tk.Button(root, text="Predict Sentiment", command=predict_sentiment)
predict_button.pack()

# Label to display the result
result_label = tk.Label(root, text="Predicted Sentiment: ")
result_label.pack()

# Run the application
root.mainloop()
