# Import Library

In [1]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load Data

In [2]:
# Read data

data = []
with open('/Users/vivian/OneDrive - NTHU/文件/清大/Data mining/DMLab2/hw/tweets_DM.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))
 
f.close()
emotion = pd.read_csv('C:/Users/vivian/OneDrive - NTHU/文件/清大/Data mining/DMLab2/hw/emotion.csv')
data_identification = pd.read_csv('C:/Users/vivian/OneDrive - NTHU/文件/清大/Data mining/DMLab2/hw/data_identification.csv')

# Build a new dataframe by selecting useful features.

In [3]:
# Filter out the 'train' data
df = pd.DataFrame(data)
_source = df['_source'].apply(lambda x: x['tweet'])
df = pd.DataFrame({
    'tweet_id': _source.apply(lambda x: x['tweet_id']),
    'hashtags': _source.apply(lambda x: x['hashtags']),
    'text': _source.apply(lambda x: x['text']),
})
df = df.merge(data_identification, on='tweet_id', how='left')

train_data = df[df['identification'] == 'train']

# Preprocess

In [4]:
train_data = train_data.merge(emotion, on='tweet_id', how='left') # Merge emotion for corresponding tweet_id
train_data.drop_duplicates(subset=['text'], keep=False, inplace=True) # Remove duplication

In [23]:
train_data_sample = train_data.sample(frac=0.1) # Get sample

In [6]:
train_data_sample

Unnamed: 0,tweet_id,hashtags,text,identification,emotion
583779,0x2d1093,[please],@ThatKevinSmith @smosier is the SMODcast you ...,train,trust
1406190,0x2edc0f,"[needfood, wantfood, givemefood]",Willing to burpee/jump squat for Jaffa Cakes r...,train,anticipation
383009,0x286121,[Power5at5],Hey @POWERATL @maddoxradio please play <LH> by...,train,sadness
1371071,0x368aff,[bathe],Does it ever dawn on people that they smell li...,train,disgust
508450,0x2c05eb,[],@JL_Baseball I was thinking contact not a hit ...,train,disgust
...,...,...,...,...,...
1447103,0x288016,[],@KingsbarnsGL what hole is the 18th for the Op...,train,sadness
279224,0x1f202d,[],"Hey @GeorgeTakei, don’t speak from a moral hig...",train,disgust
874274,0x1f8073,"[problems, to, people]","Your ""sorry"" works when you made a mistake but...",train,trust
1269672,0x23df51,[loveliveLife],Challenges and Goals for 2018! <LH> #loveliveL...,train,joy


### Droping tweet_id and identification, because it isn't going to be features used on training

In [24]:
y_train_data = train_data_sample['emotion']
X_train_data = train_data_sample.drop(['tweet_id', 'emotion', 'identification'], axis=1)
X_train_data = X_train_data['text'] + ' ' + X_train_data['hashtags'].apply(lambda x: ' '.join(x)) # Combine text and hashtags 

In [25]:
# y_train_data = train_data['emotion']
# X_train_data = train_data.drop(['tweet_id', 'emotion', 'identification'], axis=1)
# X_train_data = X_train_data['text'] + ' ' + X_train_data['hashtags'].apply(lambda x: ' '.join(x)) # Combine text and hashtags 

In [26]:
X_train_data

242688     Baby sister got her first promotion. Climbing ...
852865     Just wondering how many people follow etiquett...
865202     @CanadianPM Happy Vote @CPC_HQ 2019 Day!!!!! #...
776645     @AlexaDWilson @jtlovell1979 An elderly lady at...
604907     Judges 9:15 And the #bramble said to the #tree...
                                 ...                        
1275558    When your dad pulls through and buys chatime 👏...
646215     I'm so grateful I could burst! <LH> #gratitude...
1335474    Aliens exist. I am one, jk. But Toby McGuire h...
1429934    God wants us to always remember that He knows ...
61146      Pls follow me or be my friend I’m a nice human...
Length: 144918, dtype: object

### For the current stage, we now have combination of text and hashtags.

### 

In [10]:
from sklearn.model_selection import train_test_split
# Split training and testing data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42, stratify=y_train_data) 


In [11]:
tfidf = TfidfVectorizer(max_features=1000) # Use tfidfVectorizer and remove stop_words.
X = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test)

In [12]:
le = LabelEncoder() # Label target
y = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [13]:
clf = RandomForestClassifier() # Use RandomForest model
clf.fit(X, y)
model = clf

In [28]:
y_pred = model.predict(X_test) # Predict

In [15]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) # Evaluation

0.4913400496825835

# Deal with test data

### Convert test data into the form that same as train data.

In [16]:
test_data = df[df['identification'] == 'test'] # Get the test data from df

In [17]:
# Do the same thing as training stage, but here we don't have emotions feature.
X_test_data = test_data.drop(['tweet_id', 'identification'], axis=1)
X_test_data = X_test_data['text'] + ' ' + X_test_data['hashtags'].apply(lambda x: ' '.join(x))

In [18]:
X_test_data = tfidf.transform(X_test_data).toarray() # Convert test data by using same tfidfVectorizer

In [19]:
y_test_pred = model.predict(X_test_data)

In [20]:
y_pred_labels = le.inverse_transform(y_test_pred) # Inverse predict labels back to adjective words

## Make up submit format

In [21]:
submission = pd.DataFrame({
    'id': test_data['tweet_id'],
    'emotion': y_pred_labels
})

In [22]:
submission.to_csv('C:/Users/vivian/OneDrive - NTHU/文件/清大/Data mining/DMLab2/hw/submission.csv', index=False)