In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, entries in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, entry) for entry in entries]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
# Locations of the two labelled CSV files; both are read with pd.read_csv
# and later concatenated into one frame.
twitter_path="/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv"
hate_speech_path="/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv"

In [12]:
# Load each CSV into its own DataFrame using pandas' default parsing options.
twitter_df=pd.read_csv(twitter_path)
hate_df=pd.read_csv(hate_speech_path)

In [13]:
# Preview the first rows of the Twitter frame as loaded.
twitter_df.head()


In [14]:
# List the distinct values present in the Twitter frame's label column.
twitter_df["label"].unique()

In [15]:
# Remove the "id" column, which is not used by anything that follows.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
twitter_df = twitter_df.drop(columns="id", errors="ignore")

In [16]:
import seaborn as sns
# Bar chart of how many tweets carry each label in the Twitter frame.
# The column must be passed as x=...: seaborn >= 0.12 treats the first
# positional argument as `data`, so the old positional form raises TypeError.
sns.countplot(x='label', data=twitter_df)

In [17]:
# Preview the first rows of the hate-speech frame as loaded.
hate_df.head()

In [18]:
# Drop the listed index/count columns from the hate-speech frame; only the
# remaining columns are carried forward.
# Reassigning with errors="ignore" keeps the cell idempotent: re-running it
# no longer raises KeyError once the columns are already gone.
hate_df = hate_df.drop(
    columns=["Unnamed: 0", "count", "hate_speech", "offensive_language", "neither"],
    errors="ignore",
)

In [19]:
# Bar chart of the original class distribution in the hate-speech frame.
# x= is required: seaborn >= 0.12 treats the first positional argument as
# `data`, so the old positional form raises TypeError.
sns.countplot(x="class", data=hate_df)

In [20]:
# Collapse the class column to binary: 0 -> 1, 2 -> 0, and 1 is left unchanged.
# NOTE(review): presumably the original 0/1/2 stand for hate speech / offensive /
# neither (matching the dropped count columns) — confirm against the dataset card.
# NOTE(review): this cell is not idempotent. Re-running it on the already-mapped
# column turns the new 0s into 1s; reload hate_df before executing it again.
hate_df["class"] = hate_df["class"].replace({0:1, 2:0})

In [21]:
# Preview the hate-speech frame after the class remapping.
hate_df.head()

In [22]:
# List the distinct class values that remain after the remapping.
hate_df["class"].unique()

In [23]:
# Class distribution after binarisation: 0 is non-offensive, 1 is offensive.
# x= is required: seaborn >= 0.12 treats the first positional argument as
# `data`, so the old positional form raises TypeError.
sns.countplot(x="class", data=hate_df)

In [24]:
# Rename "class" to "label" so the column name matches twitter_df before the
# two frames are concatenated.
hate_df=hate_df.rename(columns={"class": "label"})

In [25]:
# Preview the Twitter frame just before the merge.
twitter_df.head()

In [26]:
# Preview the hate-speech frame just before the merge.
hate_df.head()

In [27]:
# Stack the two frames row-wise into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# source indexes are kept, so the same index label would refer to two rows.
df = pd.concat([twitter_df, hate_df], ignore_index=True)

In [28]:
# Preview the first rows of the combined frame.
df.head()

In [29]:
# Label distribution of the combined dataset.
# x= is required: seaborn >= 0.12 treats the first positional argument as
# `data`, so the old positional form raises TypeError.
sns.countplot(x="label", data=df)

In [30]:
# (rows, columns) of the combined frame.
df.shape

In [31]:
# Count the missing labels in the combined frame.
df["label"].isnull().sum()

In [32]:
# Look at the raw tweet text at positional row 15600 of the combined frame.
df.iloc[15600]['tweet']

In [33]:
# testing = df.iloc[503]['tweet'].decode("utf-8-sig")
# testing

In [34]:
import re
from nltk.corpus import stopwords
import string
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer instance; clean() uses it to stem each token.
snow_stemmer = SnowballStemmer(language='english')
# from nltk.tokenize import WordPunctTokenizer
# tok = WordPunctTokenizer()

In [35]:
# Show the NLTK English stopword list that clean() filters against.
print(stopwords.words('english'))

In [36]:
# Stopword list built once as a set. Calling stopwords.words('english') for
# every token rebuilds the list and scans it linearly each time, which
# dominates the runtime when clean() is applied to the whole dataset.
_STOPWORDS = frozenset(stopwords.words('english'))


def clean(text):
    """Normalise a raw tweet into lowercase, stemmed, space-separated tokens.

    Steps, in order:
      1. remove @mentions (letters, digits and underscores after the "@");
      2. remove http/https links;
      3. replace every non-letter character (digits, punctuation, "#",
         newlines, ...) with a space;
      4. lowercase and split on whitespace, discarding empty tokens;
      5. drop English stopwords;
      6. stem each remaining token with the Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces, with no leading or
        trailing whitespace; an empty string if nothing survives.
    """
    # "_" is included so handles such as @some_user are removed whole rather
    # than leaving "_user" behind as a stray token.
    text = re.sub(r'@[A-Za-z0-9_]+', "", text)  # removes mentions
    text = re.sub(r'https?://[A-Za-z0-9./]+', "", text)  # removes links
    # Everything that is not an ASCII letter becomes a space. This already
    # covers newlines and punctuation, so no separate passes are needed.
    text = re.sub("[^a-zA-Z]", " ", text)
    # split() with no argument collapses runs of whitespace and drops empty
    # tokens, so the result carries no stray leading/trailing spaces.
    tokens = text.lower().split()
    return " ".join(
        snow_stemmer.stem(token) for token in tokens if token not in _STOPWORDS
    )

In [37]:
# Show one tweet before and after cleaning as a quick sanity check.
n = 6561
raw_tweet = df.iloc[n]['tweet']
for shown in (raw_tweet, clean(raw_tweet)):
    print(shown)

In [38]:
# Replace every tweet with its cleaned form.
# NOTE(review): this overwrites the raw text, so re-running the cell applies
# clean() to text that has already been cleaned.
df["tweet"]=df["tweet"].apply(clean)

In [39]:
# Preview the combined frame after cleaning.
df.head()

In [40]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [41]:
# Build the word cloud from the full text of every label-0 tweet.
# The tweets are joined into one string explicitly: str() on the underlying
# NumPy array returns its repr, which is truncated with "..." for large
# arrays and includes brackets and quotes, so most tweets would be ignored.
text = " ".join(df.loc[df['label'] == 0, 'tweet'])

wordcloud = WordCloud().generate(text)

plt.imshow(wordcloud)
plt.axis("off")  # hide the axes; only the image is of interest
plt.show()

In [42]:
# Model input (cleaned tweet text) and target (binary label).
X=df['tweet']
y=df['label']

In [43]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [44]:
from tensorflow import keras

In [45]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,SpatialDropout1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential

In [59]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding
# layer as its input dimension.
max_words=50000 #No of distinct words allowed
# Sequence length: used as maxlen when padding and as the Embedding's input_length.
max_len=300 #max len of tweet
tokenizer=Tokenizer(num_words=max_words)


In [60]:
# Build the vocabulary from the training tweets only, then convert each
# training tweet into a list of integer word indices.
tokenizer.fit_on_texts(X_train)
sequences=tokenizer.texts_to_sequences(X_train)

In [137]:
# Show the integer encoding of the first training tweet.
print(sequences[0])

In [48]:
# Pad/truncate every training sequence to max_len so they form one 2-D matrix.
sequence_matrix=sequence.pad_sequences(sequences, maxlen=max_len)

In [49]:
# Binary text classifier: embedding -> spatial dropout -> LSTM -> one sigmoid unit.
model = Sequential([
    Embedding(max_words, 100, input_length=max_len),  # 100-dimensional word vectors
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation="sigmoid"),  # single output for the binary label
])
model.summary()


In [50]:
# Compile for binary classification, tracking accuracy during training.
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
# binary_crossentropy matches the single sigmoid output trained against 0/1 labels.
# RMSprop() is used with its default hyperparameters.

In [51]:
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

# Stop once validation accuracy has failed to improve for 5 consecutive
# epochs. restore_best_weights=True rolls the model back to its best epoch
# instead of leaving it with the weights of the last (worse) epoch.
stop = EarlyStopping(
    monitor="val_accuracy",
    patience=5,
    mode="max",
    restore_best_weights=True,
)
# Keep the weights of the best epoch on disk. filepath must name a file:
# the previous "./" was a directory, which newer Keras rejects and older
# TensorFlow turns into checkpoint files with an empty name prefix.
checkpoint = ModelCheckpoint(
    filepath="best_model.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_weights_only=True,
    save_best_only=True,
)

In [52]:
# Train for up to 10 epochs in batches of 1024, holding out 20% of the training
# matrix for validation; the callbacks handle early stopping and checkpointing.
history=model.fit(sequence_matrix, y_train, batch_size=1024, epochs=10, validation_split=0.2, callbacks=[stop, checkpoint])

In [53]:
# Save the trained model to an .h5 file in the working directory
# (note that the file name contains a space).
model.save("sentiment analysis.h5")

In [61]:
# Encode the held-out tweets with the tokenizer fitted on the training set,
# pad them to the same length, and compute the compiled loss and accuracy.
test_sequences=tokenizer.texts_to_sequences(X_test)
test_sequence_matrix=sequence.pad_sequences(test_sequences, maxlen=max_len)
model.evaluate(test_sequence_matrix, y_test)

In [92]:
# Predicted probabilities for the test set, one row per tweet.
prediction = model.predict(test_sequence_matrix)
# Convert to hard labels: below 0.5 becomes 0, anything else becomes 1.
res = [0 if preds < 0.5 else 1 for preds in prediction]


In [127]:
# Plain list of the test tweets, so they can be indexed by position alongside res.
X_list=X_test.tolist()

In [128]:
# Plain list of the true test labels, in the same order as X_list and res.
y_list=y_test.tolist()

In [130]:
# Print the first five test tweets, with their true label, that the model got wrong.
k = 0
for tweet_text, true_label, predicted_label in zip(X_list, y_list, res):
    if predicted_label == true_label:
        continue
    print(tweet_text, true_label)
    k += 1
    if k == 5:
        break

In [168]:
# Push one hand-written sentence through the same pipeline as the training data:
# clean -> integer-encode -> pad to max_len.
text = [clean("I love all people")]
print(text)
text_sequence = tokenizer.texts_to_sequences(text)
print(text_sequence)
text_sequence_matrix = sequence.pad_sequences(text_sequence, maxlen=max_len)


In [166]:
# Score the single prepared sentence.
# NOTE(review): this reuses the name `prediction`, overwriting the test-set
# predictions computed earlier.
prediction=model.predict(text_sequence_matrix)

In [167]:
# Display the model's sigmoid output for the sentence.
print(prediction)