In [1]:
import json
import pandas as pd

In [2]:
# Load the line-delimited JSON dump: one JSON object per line.
# The context manager guarantees the handle is closed, and iterating the file
# object streams lines instead of materialising them all with readlines().
# Blank lines (e.g. a trailing newline) are skipped so json.loads never sees
# an empty document.
with open("tweets_DM.json", 'r', encoding='utf-8') as tweets_file:
    papers = [json.loads(line) for line in tweets_file if line.strip()]

# json_normalize already returns a DataFrame with flattened, dotted column names.
df = pd.json_normalize(papers)
df

Unnamed: 0,_score,_index,_crawldate,_type,_source.tweet.hashtags,_source.tweet.tweet_id,_source.tweet.text
0,391,hashtag_tweets,2015-05-23 11:42:47,tweets,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,433,hashtag_tweets,2016-01-28 04:52:09,tweets,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,232,hashtag_tweets,2017-12-25 04:39:20,tweets,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k..."
3,376,hashtag_tweets,2016-01-24 23:53:05,tweets,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,hashtag_tweets,2016-01-08 17:18:59,tweets,[],0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...,...,...,...,...,...
1867530,827,hashtag_tweets,2015-05-12 12:51:52,tweets,"[mixedfeeling, butimTHATperson]",0x316b80,When you buy the last 2 tickets remaining for ...
1867531,368,hashtag_tweets,2017-10-02 17:54:04,tweets,[],0x29d0cb,I swear all this hard work gone pay off one da...
1867532,498,hashtag_tweets,2016-10-10 11:04:32,tweets,[],0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,840,hashtag_tweets,2016-09-02 14:25:06,tweets,[],0x24faed,"Ah, corporate life, where you can date <LH> us..."


In [3]:
# Keep only the tweet id and text, selected by column *name* rather than by
# position, so a different key order in the JSON cannot silently mislabel
# columns. rename() ignores labels that are absent, which makes this cell safe
# to re-run after `df` has already been reduced to the two columns.
df = df.rename(columns={'_source.tweet.tweet_id': 'tweet_id',
                        '_source.tweet.text': 'tweet_text'})
df = df[['tweet_id', 'tweet_text']]
df

Unnamed: 0,tweet_id,tweet_text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,0x28b412,"Confident of your obedience, I write to you, k..."
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,0x2de201,"""Trust is not the same as faith. A friend is s..."
...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...
1867531,0x29d0cb,I swear all this hard work gone pay off one da...
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us..."


In [4]:
# Emotion labels keyed by tweet_id (columns shown below: tweet_id, emotion).
emotion_csv_path = "./emotion.csv"
df_emotion = pd.read_csv(emotion_csv_path)
df_emotion

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [5]:
# Attach the emotion label to each tweet; merge defaults to an inner join, so
# only tweets that have a label survive.
df_concat = df.merge(df_emotion, on='tweet_id')
df_concat

Unnamed: 0,tweet_id,tweet_text,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation
...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy
1455560,0x2cbca6,there's currently two girls walking around the...,joy
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy


In [6]:
from sklearn.preprocessing import LabelEncoder

# Map the emotion strings to integer class ids; `le` is kept so predictions can
# be decoded back to emotion names later.
le = LabelEncoder()
le.fit(df_concat['emotion'])
y = le.transform(df_concat['emotion'])
y

array([1, 5, 3, ..., 4, 4, 4])

In [7]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    """Normalise a raw tweet into lowercase alphanumeric text plus its emoticons.

    Steps:
      1. Strip markup with BeautifulSoup and keep only the text content.
      2. Pull out ASCII emoticons (eyes ``: ; = X``, optional ``-`` nose,
         mouth ``) ( D P``) and remember them.
      3. Collapse every run of non-alphanumeric characters into one space and
         lowercase the remainder.
      4. Append the emoticons, with the nose removed, after the text.

    Args:
        text: Raw tweet string.

    Returns:
        The cleaned text, one space, then the space-separated emoticons
        (an empty suffix when none were found).
    """
    text = BeautifulSoup(text, 'html.parser').get_text()

    # The letter-based parts of an emoticon (X eyes, D/P mouth) must not touch
    # other alphanumerics; otherwise uppercase words such as "EXPERT" or
    # "TAXPAYER" lose their "XP" and are corrupted. Raw string avoids the
    # invalid-escape warnings of the plain-string pattern.
    emoticon_re = r'(?:[:;=]|(?<![A-Za-z0-9])X)-?(?:[()]|[DP](?![A-Za-z0-9]))'
    emoticons = re.findall(emoticon_re, text)

    # Replace with a space (not '') so the words on either side of an emoticon
    # are not glued together, e.g. "hi:)there".
    text = re.sub(emoticon_re, ' ', text)

    # After this substitution only [a-zA-Z0-9 ] remain, so no further \W pass
    # is needed before lowercasing.
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text).lower()

    return text + ' ' + ' '.join(emoticons).replace('-', '')

In [8]:
# Tokenizer: whitespace split -> stop-word filter -> Porter stemming.

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop = stopwords.words('english')
stop.append("utc")

# Built once at cell execution instead of on every call: the tokenizer runs
# once per tweet, so rebuilding the stemmer and scanning a list for each token
# is wasted work.
# NOTE: _STOP_SET is a snapshot of `stop`; re-run this cell after editing `stop`.
_STOP_SET = frozenset(stop)
_PORTER = PorterStemmer()
_STARTS_WITH_LETTER = re.compile(r'[a-zA-Z]+')


def tokenizer_stem_nostop(text):
    """Split text on whitespace, drop stop words, and stem what remains.

    A token is kept only if it is not in the stop-word list and begins with an
    ASCII letter (so numbers and emoticons such as ":)" are dropped).

    Args:
        text: Preprocessed string.

    Returns:
        List of Porter-stemmed tokens, in input order.
    """
    return [_PORTER.stem(w) for w in text.split()
            if w not in _STOP_SET and _STARTS_WITH_LETTER.match(w)]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cyc31\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer

In [10]:
# Number of hash buckets (output columns) produced for each tweet.
N_HASH_FEATURES = 2**6

# Hashing vectorizer wired to the custom preprocessing and tokenizing functions.
vec_text = HashingVectorizer(
    n_features=N_HASH_FEATURES,
    preprocessor=preprocessor,
    tokenizer=tokenizer_stem_nostop,
)

In [11]:
# Vectorize the labelled tweets.
train_texts = df_concat['tweet_text']
text_ft = vec_text.fit_transform(train_texts)



In [12]:
# Dense training feature frame, row-aligned with df_concat.
# .toarray() yields a plain ndarray; .todense() yields the legacy np.matrix
# type, which NumPy/SciPy discourage for new code.
df_total = pd.DataFrame(text_ft.toarray(), index=df_concat.index)
df_total

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.000000,0.000000,0.0,0.000000,0.353553,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1,0.000000,0.000000,0.0,0.000000,0.333333,-0.333333,0.000000,0.0,0.0,-0.333333,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
3,0.000000,0.333333,0.0,0.000000,0.000000,0.000000,0.333333,0.0,0.0,0.000000,...,0.0,0.333333,0.000000,0.000000,0.0,0.0,-0.333333,0.333333,0.000000,0.0
4,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.500000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455558,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.301511,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1455559,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.333333,0.000000,0.333333,0.0,0.0,0.000000,0.000000,0.000000,0.0
1455560,0.316228,0.316228,0.0,-0.316228,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,-0.316228,0.0
1455561,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.333333,0.000000,0.000000,0.0


In [14]:
# Random-forest baseline on the hashed text features.
rf_params = dict(
    criterion='entropy',
    n_estimators=500,
    max_depth=15,
    random_state=1,
    n_jobs=2,
)
model = RandomForestClassifier(**rf_params)
model.fit(df_total, y)

# 10-fold cross-validated score on the training data.
# NOTE(review): the recorded run printed 0.387 (+/-0.000) and the predictions
# displayed further down are all a single class; check whether the model is
# collapsing to the majority label.
pred_scores = cross_val_score(estimator=model, X=df_total, y=y, cv=10)
cv_mean = pred_scores.mean()
cv_std = pred_scores.std()
print('%s: %.3f (+/-%.3f)' % ("RandomForestClassifier", cv_mean, cv_std))

RandomForestClassifier: 0.387 (+/-0.000)


In [15]:
# Train/test assignment per tweet_id (the `identification` column).
identification_csv_path = "./data_identification.csv"
df_identification = pd.read_csv(identification_csv_path)

In [16]:
# Tag every tweet with its train/test identification (default inner join).
df_test = df.merge(df_identification, on='tweet_id')
df_test

Unnamed: 0,tweet_id,tweet_text,identification
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,0x28b412,"Confident of your obedience, I write to you, k...",test
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,0x2de201,"""Trust is not the same as faith. A friend is s...",test
...,...,...,...
1867530,0x316b80,When you buy the last 2 tickets remaining for ...,test
1867531,0x29d0cb,I swear all this hard work gone pay off one da...,test
1867532,0x2a6a4f,@Parcel2Go no card left when I wasn't in so I ...,test
1867533,0x24faed,"Ah, corporate life, where you can date <LH> us...",train


In [22]:
# Keep only the rows flagged as test, renumber them from 0, and drop the flag
# column, all in one chain with no in-place mutation of a filtered slice.
is_test_row = df_test["identification"] == "test"
df_test = (
    df_test.loc[is_test_row]
    .reset_index(drop=True)
    .drop(columns=["identification"])
)
df_test

Unnamed: 0,tweet_id,tweet_text
0,0x28b412,"Confident of your obedience, I write to you, k..."
1,0x2de201,"""Trust is not the same as faith. A friend is s..."
2,0x218443,When do you have enough ? When are you satisfi...
3,0x2939d5,"God woke you up, now chase the day #GodsPlan #..."
4,0x26289a,"In these tough times, who do YOU turn to as yo..."
...,...,...
411967,0x2913b4,"""For this is the message that ye heard from th..."
411968,0x2a980e,"""There is a lad here, which hath five barley l..."
411969,0x316b80,When you buy the last 2 tickets remaining for ...
411970,0x29d0cb,I swear all this hard work gone pay off one da...


In [24]:
# Vectorize the test tweets with the SAME vectorizer used for training.
# Use transform(), not fit_transform(): a feature extractor must never be
# re-fitted on test data. (Per the scikit-learn docs HashingVectorizer is
# stateless, so the produced values are unchanged; this removes the leakage
# pattern should the vectorizer ever be swapped for a stateful one.)
text_ft_test = vec_text.transform(df_test["tweet_text"])

# .toarray() gives a plain ndarray rather than the legacy np.matrix from .todense().
df_total_test = pd.DataFrame(text_ft_test.toarray(), index=df_test.index)
df_total_test



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.000000,0.000000,0.0,0.000000,0.0,-0.408248,0.000000,0.0,0.408248,-0.408248,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.301511,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.301511,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
2,0.000000,0.000000,0.0,0.000000,0.0,0.000000,-0.316228,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,-0.632456,-0.316228,0.000000,0.0,0.000000,0.316228
3,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,-0.408248,...,0.000000,0.000000,0.0,0.408248,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411967,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,-0.333333,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.333333,0.000000
411968,0.333333,0.000000,0.0,-0.333333,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.0,0.333333,-0.333333,0.000000,0.000000,0.0,0.000000,0.000000
411969,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.316228,0.0,0.000000,0.000000,0.000000,0.632456,0.0,0.000000,0.000000
411970,0.000000,0.377964,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.377964,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [25]:
# Predicted (integer-encoded) emotion for every test tweet, also wrapped in a
# single-column frame for the cells that follow.
predict_y = model.predict(df_total_test)
result = pd.DataFrame(data=predict_y)

In [30]:
# Flatten the single prediction column back into a 1-D array.
result_array = result.iloc[:, 0].to_numpy()
result_array

array([4, 4, 4, ..., 4, 4, 4])

In [43]:
# Decode the integer class ids back to emotion names with the fitted encoder.
decoded_emotions = le.inverse_transform(result_array)
result_df = pd.DataFrame(decoded_emotions)
result_df

Unnamed: 0,0
0,joy
1,joy
2,joy
3,joy
4,joy
...,...
411967,joy
411968,joy
411969,joy
411970,joy


In [44]:
# Build the submission frame: one row per test tweet with columns id / emotion.
# The predicted emotion is always the LAST column of result_df (the only column
# on the first run, 'emotion' on a re-run), so this cell can be re-executed
# without growing extra columns or failing on the column rename.
# Values are paired by position: a length mismatch raises immediately instead
# of being index-aligned into silent NaNs.
result_df = pd.DataFrame({
    'id': df_test['tweet_id'].to_numpy(),
    'emotion': result_df.iloc[:, -1].to_numpy(),
})
result_df

Unnamed: 0,id,emotion
0,0x28b412,joy
1,0x2de201,joy
2,0x218443,joy
3,0x2939d5,joy
4,0x26289a,joy
...,...,...
411967,0x2913b4,joy
411968,0x2a980e,joy
411969,0x316b80,joy
411970,0x29d0cb,joy


In [45]:
# Write the id/emotion predictions to disk without the row index.
submission_csv_path = "./DM_rf_all.csv"
result_df.to_csv(submission_csv_path, index=False)