In [1]:
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manojdannana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/manojdannana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
import string

In [6]:
df_reddit = pd.read_csv("reddit_worldnews_sentiments_clean.csv")
df_heads = pd.read_csv("headlines_with_sentiment.csv")

In [7]:
df_reddit.head()

Unnamed: 0,Title,Num_Comments,Upvotes,Downvotes,Upvote_Ratio,Date_Posted,Flair,Post_Category,Top_Comment_Score,Sentiment_Label
0,An anti-gay Hungarian politician has resigned ...,849,204547,0,0.93,01/12/20 18:15,,Top,7555.0,negative
1,Trump Impeached for Abuse of Power,879,202909,0,0.88,19/12/19 1:23,Trump,Top,5150.0,negative
2,Vladimir Putin's black belt revoked by interna...,798,200149,0,0.89,28/02/22 20:45,,Top,2907.0,neutral
3,"Two weeks before his inauguration, Donald J. T...",914,189352,0,0.84,19/07/18 2:06,,Top,249.0,positive
4,"Queen Elizabeth II has died, Buckingham Palace...",905,189025,0,0.79,08/09/22 17:32,,Top,1.0,negative


In [8]:
df_heads.head()

Unnamed: 0,headline,Sentiment_Label
0,Over 4 Million Americans Roll Up Sleeves For O...,neutral
1,"American Airlines Flyer Charged, Banned For Li...",negative
2,23 Of The Funniest Tweets About Cats And Dogs ...,positive
3,The Funniest Tweets From Parents This Week (Se...,positive
4,Woman Who Called Cops On Black Bird-Watcher Lo...,negative


In [9]:
df = df_reddit[["Title","Sentiment_Label"]].copy()

In [10]:
df.head()

Unnamed: 0,Title,Sentiment_Label
0,An anti-gay Hungarian politician has resigned ...,negative
1,Trump Impeached for Abuse of Power,negative
2,Vladimir Putin's black belt revoked by interna...,neutral
3,"Two weeks before his inauguration, Donald J. T...",positive
4,"Queen Elizabeth II has died, Buckingham Palace...",negative


In [11]:
df_heads.columns = ["Title", "Sentiment_Label"]

In [12]:
df = pd.concat([df, df_heads.copy()], ignore_index=True)

In [13]:
df.head()

Unnamed: 0,Title,Sentiment_Label
0,An anti-gay Hungarian politician has resigned ...,negative
1,Trump Impeached for Abuse of Power,negative
2,Vladimir Putin's black belt revoked by interna...,neutral
3,"Two weeks before his inauguration, Donald J. T...",positive
4,"Queen Elizabeth II has died, Buckingham Palace...",negative


In [14]:
df.shape

(211332, 2)

In [15]:
df['Title']=df['Title'].str.lower()

In [206]:
stopwords_list = stopwords.words('english')

In [207]:
STOPWORDS = set(stopwords_list)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df['Title'] = df['Title'].apply(lambda text: cleaning_stopwords(text))

In [208]:
english_punctuations = string.punctuation
english_punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [209]:
def cleaning_punctuations(text):
    translator = str.maketrans('', '', english_punctuations)
    return text.translate(translator)
df['Title'] = df['Title'].apply(lambda text: cleaning_punctuations(text))

In [210]:
def lemmatizer_on_text(words):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]
df['Title'] = df['Title'].apply(lambda x: lemmatizer_on_text(x))

In [211]:
tokenizer = Tokenizer()

df['Title'] = df['Title'].astype(str)

df['Title'] = df['Title'].str.lower().replace('[^\w\s]', '', regex=True)

tokenizer.fit_on_texts(df["Title"])

sequences = tokenizer.texts_to_sequences(df["Title"])

maxlen = max(len(seq) for seq in sequences) 
padded_sequences = pad_sequences(sequences, padding='post', maxlen=maxlen)

In [212]:
print(padded_sequences)

[[ 2  7  6 ...  0  0  0]
 [ 6  5 14 ...  0  0  0]
 [22  9  2 ...  0  0  0]
 ...
 [16  4  2 ...  0  0  0]
 [ 2  9 11 ...  0  0  0]
 [11 20  4 ...  0  0  0]]


In [213]:
label_encoder = LabelEncoder()

df['Sentiment_Label_enc'] = label_encoder.fit_transform(df['Sentiment_Label'])

In [214]:
df.head()

Unnamed: 0,Title,Sentiment_Label,Sentiment_Label_enc
0,a n t i g a y h u n g a r i a n p o l i t ...,negative,0
1,t r u m p i m p e a c h e d a b u s e p ...,negative,0
2,v l a d i m i r p u t i n s b l a c k b ...,neutral,1
3,t w o w e e k s i n a u g u r a t i o n ...,positive,2
4,q u e e n e l i z a b e t h i i d i e d ...,negative,0


## Decision Tree Classifier

In [215]:
X = padded_sequences
y = df["Sentiment_Label_enc"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

In [216]:
print(accuracy)

0.38632976080630277


## Random Forest Classifier

In [217]:
clf = RandomForestClassifier()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

In [218]:
print(accuracy)

0.4320628386211465


## Logistic Regression Classifier

In [219]:
# clf = SVC()

# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)

In [220]:
# print(accuracy)

In [221]:
model = LogisticRegression(max_iter=1000, solver='sag', C=0.1, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)



In [222]:
print(accuracy)

0.4036955544514633


In [223]:
X.shape, y.shape

((211332, 238), (211332,))

In [229]:
inputs = tf.keras.layers.Input(name='inputs',shape=[238])
layer = tf.keras.layers.Embedding(500,50)(inputs) 
layer = tf.keras.layers.LSTM(64)(layer) 
layer = tf.keras.layers.Dense(256,name='FC1')(layer) 
layer = tf.keras.layers.Activation('relu')(layer) 
layer = tf.keras.layers.Dropout(0.5)(layer) 
layer = tf.keras.layers.Dense(1,name='out_layer')(layer) 
layer = tf.keras.layers.Activation('sigmoid')(layer) 
model = tf.keras.models.Model(inputs=inputs,outputs=layer) 

In [230]:
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [231]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

Epoch 1/10
[1m2378/2378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 79ms/step - accuracy: 0.3871 - loss: -51.4270 - val_accuracy: 0.3842 - val_loss: -262.4805
Epoch 2/10
[1m2378/2378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 79ms/step - accuracy: 0.3861 - loss: -580.1841 - val_accuracy: 0.3842 - val_loss: -894.7036
Epoch 3/10
[1m2378/2378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 78ms/step - accuracy: 0.3867 - loss: -1650.0503 - val_accuracy: 0.3842 - val_loss: -1813.4479
Epoch 4/10
[1m2378/2378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 79ms/step - accuracy: 0.3885 - loss: -3002.4058 - val_accuracy: 0.3842 - val_loss: -2998.1064
Epoch 5/10
[1m2378/2378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 78ms/step - accuracy: 0.3868 - loss: -4473.2944 - val_accuracy: 0.3842 - val_loss: -4519.5913
Epoch 6/10
[1m  65/2378[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:54[0m 76ms/step - accuracy: 0.3672 - loss: -6952.5845

KeyboardInterrupt: 

In [232]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m1321/1321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - accuracy: 0.3855 - loss: -5600.2441


In [233]:
model1 = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(238,))
])

# Compile the model
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model1.fit(X, y, epochs=50, batch_size=32, verbose=2)

# Evaluate the model
loss, accuracy = model1.evaluate(X, y)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/50
6605/6605 - 1s - 212us/step - accuracy: 0.3853 - loss: -1.0092e+01
Epoch 2/50
6605/6605 - 1s - 183us/step - accuracy: 0.3853 - loss: -2.9728e+01
Epoch 3/50
6605/6605 - 1s - 183us/step - accuracy: 0.3853 - loss: -4.9490e+01
Epoch 4/50
6605/6605 - 1s - 183us/step - accuracy: 0.3853 - loss: -6.9031e+01
Epoch 5/50
6605/6605 - 1s - 182us/step - accuracy: 0.3853 - loss: -8.8500e+01
Epoch 6/50
6605/6605 - 1s - 184us/step - accuracy: 0.3853 - loss: -1.0820e+02
Epoch 7/50
6605/6605 - 1s - 183us/step - accuracy: 0.3853 - loss: -1.2770e+02
Epoch 8/50
6605/6605 - 1s - 182us/step - accuracy: 0.3853 - loss: -1.4733e+02
Epoch 9/50
6605/6605 - 1s - 182us/step - accuracy: 0.3853 - loss: -1.6710e+02
Epoch 10/50
6605/6605 - 1s - 182us/step - accuracy: 0.3853 - loss: -1.8666e+02
Epoch 11/50
6605/6605 - 1s - 183us/step - accuracy: 0.3853 - loss: -2.0668e+02
Epoch 12/50
6605/6605 - 1s - 186us/step - accuracy: 0.3853 - loss: -2.2616e+02
Epoch 13/50
6605/6605 - 1s - 182us/step - accuracy: 0.3853 - 

In [17]:
df.to_csv("cumilative_headlines.csv", index=False)