In [3]:
import pandas as pd
import numpy as np
import json
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [4]:
# 載入數據
df_identity = pd.read_csv('/content/data_identification.csv')
df_emotion = pd.read_csv('/content/emotion.csv')
df_tweet = pd.read_json('/content/tweets_DM.json',lines=True)

In [5]:
df_tweet_source = df_tweet[['_source']]
print(df_tweet_source.head())

                                             _source
0  {'tweet': {'hashtags': ['Snapchat'], 'tweet_id...
1  {'tweet': {'hashtags': ['freepress', 'TrumpLeg...
2  {'tweet': {'hashtags': ['bibleverse'], 'tweet_...
3  {'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...
4  {'tweet': {'hashtags': [], 'tweet_id': '0x2de2...


In [6]:
df_tweet['_source_str'] = df_tweet['_source'].apply(str)
df_tweet['_source_split'] = df_tweet['_source_str'].apply(lambda x: x.split(','))
print(df_tweet[['_source_split']].head())

                                       _source_split
0  [{'tweet': {'hashtags': ['Snapchat'],  'tweet_...
1  [{'tweet': {'hashtags': ['freepress',  'TrumpL...
2  [{'tweet': {'hashtags': ['bibleverse'],  'twee...
3  [{'tweet': {'hashtags': [],  'tweet_id': '0x1c...
4  [{'tweet': {'hashtags': [],  'tweet_id': '0x2d...


In [7]:
df_tweet['_source_cleaned'] = df_tweet['_source_str'].apply(lambda x: x.split(',', 1)[-1] if ',' in x else x)
print(df_tweet[['_source_cleaned']].head(20))

                                      _source_cleaned
0    'tweet_id': '0x376b20', 'text': 'People who p...
1    'TrumpLegacy', 'CNN'], 'tweet_id': '0x2d5350'...
2    'tweet_id': '0x28b412', 'text': 'Confident of...
3    'tweet_id': '0x1cd5b0', 'text': 'Now ISSA is ...
4    'tweet_id': '0x2de201', 'text': '"Trust is no...
5    'LaughOutLoud'], 'tweet_id': '0x1d755c', 'tex...
6    'tweet_id': '0x2c91a8', 'text': 'Still waitin...
7    'tweet_id': '0x368e95', 'text': 'Love knows n...
8    'tweet_id': '0x249c0c', 'text': '@DStvNgCare ...
9    'money', 'possessions'], 'tweet_id': '0x21844...
10   'gender', 'diversity'], 'tweet_id': '0x359db9...
11   'tweet_id': '0x23b037', 'text': "I love suffe...
12   'tweet_id': '0x1fde89', 'text': 'Can someone ...
13   'ecology'], 'tweet_id': '0x37a0a9', 'text': '...
14   'tweet_id': '0x269112', 'text': "My brother d...
15   'tweet_id': '0x360665', 'text': 'On a scale o...
16   'evatech', 'bendingcomposite', 'inovarsandton...
17   'tweet_id': '0x25be54',

In [8]:
import re

def extract_tweet_info(source_str):
    # Regular expression to find 'tweet_id' and 'text' and their values
    tweet_id_match = re.search(r"'tweet_id':\s*'([^']*)'", source_str)
    text_match = re.search(r"'text':\s*'([^']*)'", source_str)

    # Extract 'tweet_id' and 'text' if found
    tweet_id = tweet_id_match.group(0) if tweet_id_match else None
    text = text_match.group(0) if text_match else None

    # Combine the results (only if both are found)
    return ', '.join(filter(None, [tweet_id, text]))

# Apply the function to the '_source' column
df_tweet['_source_cleaned'] = df_tweet['_source_str'].apply(extract_tweet_info)

# Display the first few rows with the cleaned data
print(df_tweet[['_source_cleaned']].head())

                                     _source_cleaned
0  'tweet_id': '0x376b20', 'text': 'People who po...
1  'tweet_id': '0x2d5350', 'text': '@brianklaas A...
2  'tweet_id': '0x28b412', 'text': 'Confident of ...
3  'tweet_id': '0x1cd5b0', 'text': 'Now ISSA is s...
4  'tweet_id': '0x2de201', 'text': '"Trust is not...


In [9]:
# Extract tweet_id and text into separate columns using regular expressions
df_tweet['tweet_id'] = df_tweet['_source_cleaned'].apply(lambda x: re.search(r"'tweet_id':\s*'([^']*)'", x).group(1) if re.search(r"'tweet_id':\s*'([^']*)'", x) else None)
df_tweet['text'] = df_tweet['_source_cleaned'].apply(lambda x: re.search(r"'text':\s*'([^']*)'", x).group(1) if re.search(r"'text':\s*'([^']*)'", x) else None)
df_tweet = df_tweet.drop(columns=['_source'])

# Display the first few rows to verify the result
print(df_tweet[['tweet_id', 'text']].head())

   tweet_id                                               text
0  0x376b20  People who post "add me on #Snapchat" must be ...
1  0x2d5350  @brianklaas As we see, Trump is dangerous to #...
2  0x28b412  Confident of your obedience, I write to you, k...
3  0x1cd5b0                Now ISSA is stalking Tasha 😂😂😂 <LH>
4  0x2de201  "Trust is not the same as faith. A friend is s...


In [10]:
# merge the datasets
df_identity['tweet_id'] = df_identity['tweet_id'].str.strip()
df_emotion['tweet_id'] = df_emotion['tweet_id'].str.strip()
df_tweet['tweet_id'] = df_tweet['tweet_id'].str.strip()
df_merged = pd.merge(df_identity, df_emotion, on='tweet_id', how='outer')
df_merged = pd.merge(df_merged, df_tweet, on='tweet_id', how='outer')

df_train = df_merged[df_merged['identification'] == 'train']
df_test = df_merged[df_merged['identification'] == 'test']

In [11]:
# Drop unnecessary columns
columns_to_drop = ['_crawldate', '_index', '_type', '_source_str', '_source_split', '_source_cleaned']
df_train = df_train.drop(columns=columns_to_drop, axis=1)
df_test = df_test.drop(columns=columns_to_drop, axis=1)


In [12]:
# clean NaN
df_train['text'] = df_train['text'].fillna("")
df_test['text'] = df_test['text'].fillna("")


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


# 1. TF-IDF for feature extraction
BOW_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.7
)

BOW_vectorizer.fit(df_train['text'])
X_train = BOW_vectorizer.transform(df_train['text'])
X_test = BOW_vectorizer.transform(df_test['text'])

# 2. convert the label
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['emotion'])

print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {y_train.shape}")
print(f"X_train shape: {X_train.shape}")

X_test shape: (411972, 5000)
Y_train shape: (1455563,)
X_train shape: (1455563, 5000)


In [None]:
clf = LogisticRegression(max_iter=1000, solver='saga', penalty='elasticnet', l1_ratio=0.5, n_jobs=-1)
clf.fit(X_train, y_train)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import numpy as np


try:
    y_pred = clf.predict(X_test)

except Exception as e:
    print("Error during prediction:", e)
    print("Ignoring problematic rows.")

    # exclude the data that casue error
    valid_indices = np.array([i for i in range(X_test.shape[0]) if i not in problematic_indices])

    y_pred = clf.predict(X_test[valid_indices])

y_pred_classes = label_encoder.inverse_transform(y_pred)

df_test['predicted_emotion'] = y_pred_classes
df_test.to_csv('test_predictions.csv', index=False)


In [16]:
df_pred = pd.read_csv('test_predictions.csv')
print(df_pred.head())

   tweet_id identification  emotion  _score  \
0  0x1c7f0f           test      NaN      62   
1  0x1c7f12           test      NaN     756   
2  0x1c7f13           test      NaN     213   
3  0x1c7f17           test      NaN     603   
4  0x1c7f18           test      NaN     609   

                                                text predicted_emotion  
0                                                NaN               joy  
1                                                NaN               joy  
2  The only “big plan” you ever had in your life,...               joy  
3  Looking back on situations old & new, recent o...      anticipation  
4  @jasoninthehouse Why do you insist on talking ...           sadness  


In [17]:
import pandas as pd

df = pd.read_csv('test_predictions.csv')

# keep the 'tweet_id' and 'predicted_emotion' only
df_pred = df[['tweet_id', 'predicted_emotion']]

# change column name
df_pred.columns = ['id', 'emotion']


df_pred.to_csv('test_predictions_reduced.csv', index=False)


print(df_pred.head())



         id       emotion
0  0x1c7f0f           joy
1  0x1c7f12           joy
2  0x1c7f13           joy
3  0x1c7f17  anticipation
4  0x1c7f18       sadness
