In [1]:
from google.colab import drive
import pandas as pd
import numpy as np

Load Data

In [19]:
drive.mount('/content/gdrive/')
# Read the data
data_identification = pd.read_csv("/content/gdrive/MyDrive/gideon/data_identification.csv")
data_emotion = pd.read_csv("/content/gdrive/MyDrive/gideon/emotion.csv")

# Merge data_identification and data_emotion based on tweet id
merged_data = pd.merge(data_identification, data_emotion, on=['tweet_id'], how='left')
merged_data.head()

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


Unnamed: 0,tweet_id,identification,emotion
0,0x28cc61,test,
1,0x29e452,train,joy
2,0x2b3819,train,joy
3,0x2db41f,test,
4,0x2a2acc,train,trust


In [20]:
import json
import pandas as pd

# Specify the path to your JSON file
file_path = '/content/gdrive/MyDrive/gideon/tweets_DM.json'

# Initialize lists to store data
scores = []
hashtags_list = []
tweet_ids = []
texts = []
dates = []

# Read the JSON data from the file
with open(file_path, 'r') as file:
    data_lines = file.readlines()

# Process each line
for line in data_lines:
    try:
        # Parse the JSON using a custom object hook
        data = json.loads(line, object_hook=lambda d: {k: v if not isinstance(v, list) or len(v) > 0 else None for k, v in d.items()})

        # Extract the required fields
        score = data.get('_score', None)
        hashtags = data.get('_source', {}).get('tweet', {}).get('hashtags', None)
        tweet_id = data.get('_source', {}).get('tweet', {}).get('tweet_id', None)
        text = data.get('_source', {}).get('tweet', {}).get('text', None)
        date = data.get('_crawldate', None)

        # Append data to lists
        scores.append(score)
        hashtags_list.append(hashtags)
        tweet_ids.append(tweet_id)
        texts.append(text)
        dates.append(date)
    except json.JSONDecodeError:
        # Handle invalid JSON entries
        print(f"Skipping invalid JSON entry: {line.strip()}")

# Create a Pandas DataFrame
df = pd.DataFrame({
    'score': scores,
    'hashtag': hashtags_list,
    'tweet_id': tweet_ids,
    'text': texts,
    'date': dates
})

merged_data = pd.merge(merged_data, df, on=['tweet_id'], how='left')


In [21]:
merged_data.head()

Unnamed: 0,tweet_id,identification,emotion,score,hashtag,text,date
0,0x28cc61,test,,107,,@Habbo I've seen two separate colours of the e...,2017-01-17 14:13:32
1,0x29e452,train,joy,809,,Huge Respect🖒 @JohnnyVegasReal talking about l...,2015-01-17 03:07:03
2,0x2b3819,train,joy,808,"[spateradio, app]",Yoooo we hit all our monthly goals with the ne...,2016-07-02 09:34:06
3,0x2db41f,test,,728,,@FoxNews @KellyannePolls No serious self respe...,2015-10-17 06:46:20
4,0x2a2acc,train,trust,16,,@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...,2016-08-15 18:18:39


Preprocessing

In [22]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Function to preprocess text
stop_words = set(nltk.corpus.stopwords.words('english'))
def preprocess_text(text):
    # Tokenize and convert to lowercase
    tokens = nltk.tokenize.word_tokenize(text.lower())

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Remove special characters and numbers
    tokens = [token for token in tokens if token.isalpha()]

    return tokens

# Apply the preprocessing function to the 'text' column
merged_data['tokenized_text'] = merged_data['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [23]:
from sklearn.preprocessing import LabelEncoder

# Encode emotion to numerical values
label_encoder = LabelEncoder()

merged_data['emotion'] = label_encoder.fit_transform(merged_data['emotion'])

In [24]:
merged_data.to_pickle("/content/gdrive/MyDrive/gideon/merged_tokenized_data.pkl")

In [2]:
drive.mount('/content/gdrive/')
# Read the data
merged_data = pd.read_pickle("/content/gdrive/MyDrive/gideon/merged_tokenized_data.pkl")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
merged_data.head()

Unnamed: 0,tweet_id,identification,emotion,score,hashtag,text,date,tokenized_text
0,0x28cc61,test,8,107,,@Habbo I've seen two separate colours of the e...,2017-01-17 14:13:32,"[habbo, seen, two, separate, colours, elegant,..."
1,0x29e452,train,4,809,,Huge Respect🖒 @JohnnyVegasReal talking about l...,2015-01-17 03:07:03,"[huge, johnnyvegasreal, talking, losing, dad, ..."
2,0x2b3819,train,4,808,"[spateradio, app]",Yoooo we hit all our monthly goals with the ne...,2016-07-02 09:34:06,"[yoooo, hit, monthly, goals, new, app, two, we..."
3,0x2db41f,test,8,728,,@FoxNews @KellyannePolls No serious self respe...,2015-10-17 06:46:20,"[foxnews, kellyannepolls, serious, self, respe..."
4,0x2a2acc,train,7,16,,@KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...,2016-08-15 18:18:39,"[kidsnts, uhbcomms, bwchboss, well, done, team..."


In [4]:
# merged_data['hashtag'].fillna("0").apply(list)
merged_data.drop(columns=["hashtag", "date", "text"], inplace=True, axis=1)
# del merged_data["hashtag"]
# del merged_data["date"]
# del merged_data["text"]

In [5]:
merged_data.dtypes

tweet_id          object
identification    object
emotion            int64
score              int64
tokenized_text    object
dtype: object

In [5]:
# # Load google word vectorizer
# from gensim.models import KeyedVectors

# drive.mount('/content/gdrive/')
# model_path = "/content/gdrive/MyDrive/gideon/GoogleNews-vectors-negative300.bin.gz"
# w2v_google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
# print('load ok')

import gensim.downloader as api

glove_twitter_25_model = api.load("glove-twitter-25")
print('load ok')

load ok


In [6]:
# # Convert to word vectors
# merged_data["word_vectors"] = merged_data["tokenized_text"].apply(lambda tokenized_list: [w2v_google_model[word] for word in tokenized_list if word in w2v_google_model])

# Assume merged_data is your DataFrame with 'tokenized_text' column
merged_data["word_vectors"] = merged_data["tokenized_text"].apply(
    lambda tokenized_list: np.mean([glove_twitter_25_model[word] for word in tokenized_list if word in glove_twitter_25_model], axis=0)
)

# If a document has no words in the GloVe model, fill with zeros (or handle differently based on your use case)
merged_data["word_vectors"] = merged_data["word_vectors"].apply(lambda vec: np.zeros_like(vec) if np.all(np.isnan(vec)) else vec)

  return _methods._mean(a, axis=axis, dtype=dtype,


In [8]:
merged_data.head()

Unnamed: 0,tweet_id,identification,emotion,score,tokenized_text,word_vectors
0,0x28cc61,test,8,107,"[habbo, seen, two, separate, colours, elegant,...","[[1.0601, 0.023332, 0.90514, -0.30314, -0.3204..."
1,0x29e452,train,4,809,"[huge, johnnyvegasreal, talking, losing, dad, ...","[[-0.64788, 0.37416, -0.64518, -0.17781, 0.728..."
2,0x2b3819,train,4,808,"[yoooo, hit, monthly, goals, new, app, two, we...","[[-0.6619, 0.9804, 0.030923, 0.33417, -0.40252..."
3,0x2db41f,test,8,728,"[foxnews, kellyannepolls, serious, self, respe...","[[1.2358, 1.4961, -0.29579, -1.0837, -1.5759, ..."
4,0x2a2acc,train,7,16,"[kidsnts, uhbcomms, bwchboss, well, done, team...","[[-0.075826, 0.22199, -0.1119, -0.61915, -0.69..."


In [7]:
merged_data.drop(columns=["tokenized_text"], inplace=True, axis=1)
# del merged_data["identification"]
# del merged_data["tweet_id"]
# del merged_data["tokenized_text"]

In [8]:
# Separate merged_data to training and test data
train_data = merged_data[merged_data['identification'] == 'train']
test_data = merged_data[merged_data['identification'] == 'test']
train_data.drop(['identification'], axis=1, inplace=True)
test_data.drop(['identification'], axis=1, inplace=True)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Save and drop tweet_id
train_tweet_id = train_data['tweet_id']
test_tweet_id = test_data['tweet_id']
train_data.drop(['tweet_id'], axis=1, inplace=True)
test_data.drop(['tweet_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(['identification'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(['identification'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(['tweet_id'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t

In [9]:
len(train_data)

1455563

In [10]:
# Find the maximum length of 'word_vectors'
max_length = train_data['word_vectors'].apply(lambda x: len(np.array(x).flatten())).max()

print("Maximum length of 'word_vectors':", max_length)

Maximum length of 'word_vectors': 25


In [11]:
# Calculate lengths of 'word_vectors'
train_data['length'] = train_data['word_vectors'].apply(lambda x: len(np.array(x).flatten()))

# Find and print the 20 largest lengths
largest_lengths = train_data.nlargest(30, 'length')

print(largest_lengths['length'])

0     25
1     25
2     25
3     25
4     25
5     25
6     25
7     25
8     25
9     25
10    25
11    25
12    25
13    25
14    25
15    25
16    25
17    25
18    25
19    25
20    25
21    25
22    25
23    25
24    25
25    25
26    25
27    25
28    25
29    25
Name: length, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['length'] = train_data['word_vectors'].apply(lambda x: len(np.array(x).flatten()))


In [13]:
# # Flatten the 'word_vectors' column
# flat_vectors = train_data['word_vectors'].apply(pd.Series).stack().reset_index(level=1, drop=True)

# Flatten the 'word_vectors' column
flat_vectors = pd.DataFrame(train_data['word_vectors'].apply(lambda x: np.array(x).flatten()).tolist())

In [15]:
# flat_vectors.drop(columns=[i for i in range(775, 925)], inplace=True)

In [14]:
flat_vectors.shape

(1455563, 25)

In [15]:
print(flat_vectors.head())
# Reset the index of flat_vectors
flat_vectors = flat_vectors.reset_index(drop=True)
print(flat_vectors.head())

         0         1         2         3         4         5         6   \
0 -0.239156  0.478082 -0.028842  0.170074 -0.080897  0.270186  1.569743   
1  0.378303  0.802614  0.118444 -0.409916  0.402002 -0.076872  0.438551   
2  0.141647  0.658575 -0.074542 -0.447934 -0.214162  0.101417  1.027847   
3 -0.187056  0.650943 -0.547361 -0.533652 -0.380929 -0.222686  0.186410   
4 -0.458293  0.356902 -0.314386  0.411954 -0.410917 -0.396513  1.062308   

         7         8         9   ...        15        16        17        18  \
0 -0.313173 -0.672079  0.013337  ...  0.084838  0.153812 -0.471247 -0.023905   
1 -0.210774  0.027494 -0.269742  ...  0.007873  0.375602 -0.344965 -0.194918   
2  0.344352 -0.261323  0.127377  ... -0.062229  0.455992 -0.735958 -0.384858   
3 -0.703887  0.262093 -0.255643  ...  0.430306  0.740374 -0.551486  0.481326   
4 -0.586283 -0.176416 -0.201900  ... -0.165525  0.096322  0.004865  0.162163   

         19        20        21        22        23        24  
0 -0

In [20]:
flat_vectors.isna().sum(axis=0)

0       0
1     744
2     744
3     744
4     744
5     744
6     744
7     744
8     744
9     744
10    744
11    744
12    744
13    744
14    744
15    744
16    744
17    744
18    744
19    744
20    744
21    744
22    744
23    744
24    744
dtype: int64

In [21]:
flat_vectors.fillna(0, inplace=True)

In [22]:
flat_vectors.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
dtype: int64

In [23]:
# Concatenate flat_vectors with train_data
train_data = pd.concat([train_data, flat_vectors], axis=1).drop('word_vectors', axis=1)

# Display the resulting DataFrame
print(train_data)

         emotion  score  length         0         1         2         3  \
0              4    809      25 -0.239156  0.478082 -0.028842  0.170074   
1              4    808      25  0.378303  0.802614  0.118444 -0.409916   
2              7     16      25  0.141647  0.658575 -0.074542 -0.447934   
3              4    768      25 -0.187056  0.650943 -0.547361 -0.533652   
4              1     70      25 -0.458293  0.356902 -0.314386  0.411954   
...          ...    ...     ...       ...       ...       ...       ...   
1455558        2    361      25 -0.229692  0.289221 -0.079855  0.036754   
1455559        5     15      25  0.100302  0.469010 -0.114853 -0.429373   
1455560        4    174      25 -0.591558  0.204634 -0.551633  0.473667   
1455561        7    515      25 -0.163860  0.448487  0.124058 -0.041117   
1455562        7    850      25  0.107990  0.618000 -0.159146  0.343975   

                4         5         6  ...        15        16        17  \
0       -0.080897  0.27

In [24]:
train_data.head()

Unnamed: 0,emotion,score,length,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,24
0,4,809,25,-0.239156,0.478082,-0.028842,0.170074,-0.080897,0.270186,1.569743,...,0.084838,0.153812,-0.471247,-0.023905,-0.379791,0.273411,0.350186,-0.045849,-0.029948,-0.45318
1,4,808,25,0.378303,0.802614,0.118444,-0.409916,0.402002,-0.076872,0.438551,...,0.007873,0.375602,-0.344965,-0.194918,-0.0504,-0.578334,-0.407883,0.256118,0.275443,-0.460774
2,7,16,25,0.141647,0.658575,-0.074542,-0.447934,-0.214162,0.101417,1.027847,...,-0.062229,0.455992,-0.735958,-0.384858,-0.049885,-0.090653,0.115923,-0.070305,0.407485,-0.459018
3,4,768,25,-0.187056,0.650943,-0.547361,-0.533652,-0.380929,-0.222686,0.18641,...,0.430306,0.740374,-0.551486,0.481326,-0.08418,-0.35302,-0.064183,-0.211984,-0.02795,-0.029366
4,1,70,25,-0.458293,0.356902,-0.314386,0.411954,-0.410917,-0.396513,1.062308,...,-0.165525,0.096322,0.004865,0.162163,-0.327495,-0.14229,0.436686,0.024385,0.22318,0.055374


In [25]:
train_data.dtypes

emotion      int64
score        int64
length       int64
0          float64
1          float64
2          float64
3          float64
4          float64
5          float64
6          float64
7          float64
8          float64
9          float64
10         float64
11         float64
12         float64
13         float64
14         float64
15         float64
16         float64
17         float64
18         float64
19         float64
20         float64
21         float64
22         float64
23         float64
24         float64
dtype: object

In [26]:
test_data.isna().sum()

emotion         0
score           0
word_vectors    0
dtype: int64

Building and Tuning Model

In [27]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import f1_score

In [28]:
train_data.head()

Unnamed: 0,emotion,score,length,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,24
0,4,809,25,-0.239156,0.478082,-0.028842,0.170074,-0.080897,0.270186,1.569743,...,0.084838,0.153812,-0.471247,-0.023905,-0.379791,0.273411,0.350186,-0.045849,-0.029948,-0.45318
1,4,808,25,0.378303,0.802614,0.118444,-0.409916,0.402002,-0.076872,0.438551,...,0.007873,0.375602,-0.344965,-0.194918,-0.0504,-0.578334,-0.407883,0.256118,0.275443,-0.460774
2,7,16,25,0.141647,0.658575,-0.074542,-0.447934,-0.214162,0.101417,1.027847,...,-0.062229,0.455992,-0.735958,-0.384858,-0.049885,-0.090653,0.115923,-0.070305,0.407485,-0.459018
3,4,768,25,-0.187056,0.650943,-0.547361,-0.533652,-0.380929,-0.222686,0.18641,...,0.430306,0.740374,-0.551486,0.481326,-0.08418,-0.35302,-0.064183,-0.211984,-0.02795,-0.029366
4,1,70,25,-0.458293,0.356902,-0.314386,0.411954,-0.410917,-0.396513,1.062308,...,-0.165525,0.096322,0.004865,0.162163,-0.327495,-0.14229,0.436686,0.024385,0.22318,0.055374


In [29]:
X_train = train_data.drop('emotion', axis=1)
y_train = train_data['emotion'].astype('int32')

In [33]:
X_train.head()

Unnamed: 0,score,length,0,1,2,3,4,5,6,7,...,15,16,17,18,19,20,21,22,23,24
0,809,25,-0.239156,0.478082,-0.028842,0.170074,-0.080897,0.270186,1.569743,-0.313173,...,0.084838,0.153812,-0.471247,-0.023905,-0.379791,0.273411,0.350186,-0.045849,-0.029948,-0.45318
1,808,25,0.378303,0.802614,0.118444,-0.409916,0.402002,-0.076872,0.438551,-0.210774,...,0.007873,0.375602,-0.344965,-0.194918,-0.0504,-0.578334,-0.407883,0.256118,0.275443,-0.460774
2,16,25,0.141647,0.658575,-0.074542,-0.447934,-0.214162,0.101417,1.027847,0.344352,...,-0.062229,0.455992,-0.735958,-0.384858,-0.049885,-0.090653,0.115923,-0.070305,0.407485,-0.459018
3,768,25,-0.187056,0.650943,-0.547361,-0.533652,-0.380929,-0.222686,0.18641,-0.703887,...,0.430306,0.740374,-0.551486,0.481326,-0.08418,-0.35302,-0.064183,-0.211984,-0.02795,-0.029366
4,70,25,-0.458293,0.356902,-0.314386,0.411954,-0.410917,-0.396513,1.062308,-0.586283,...,-0.165525,0.096322,0.004865,0.162163,-0.327495,-0.14229,0.436686,0.024385,0.22318,0.055374


In [30]:
# XGBoost
param_grid = {
    'n_estimators': [100, 300, 600],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'reg_lambda': [0, 10, 20],
}

In [None]:
optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax', num_class=8, enable_categorical=True),
                            param_grid=param_grid,
                            cv=StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=0),
                            scoring='f1_macro',
                            return_train_score=True,
                            verbose=3
                            ).fit(X_train, y_train)

print("Best parameters for XGB Model:", optimal_params.best_params_)
print("Score using the best parameters:", optimal_params.best_score_)

# y_pred = optimal_params.predict(X_val)
# print('Validation F1-Score =', f1_score(y_val, y_pred))

Fitting 1 folds for each of 108 candidates, totalling 108 fits
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=100, reg_lambda=0;, score=(train=0.224, test=0.224) total time=  50.3s
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=100, reg_lambda=10;, score=(train=0.223, test=0.223) total time=  49.1s
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=100, reg_lambda=20;, score=(train=0.223, test=0.223) total time=  50.9s
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=300, reg_lambda=0;, score=(train=0.270, test=0.267) total time= 2.3min
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=300, reg_lambda=10;, score=(train=0.269, test=0.266) total time= 2.3min
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=300, reg_lambda=20;, score=(train=0.269, test=0.266) total time= 2.3min
[CV 1/1] END learning_rate=0.05, max_depth=4, n_estimators=600, reg_lambda=0;, score=(train=0.291, test=0.284) total time= 4.3min
[CV 1/1] END learning_r

Training Model

In [None]:
xgb_model = xgb.XGBClassifier(objective='binary:logistic', enable_categorical=True,
                              reg_lambda=20,
                              learning_rate=0.05,
                              max_depth=4,
                              n_estimators=600)
xgb_model.fit(X_train, y_train)

In [None]:
train_data.head()

Predict test data

In [None]:
 y_preds = xgb_model.predict(test_data)

Create submission dataframe

In [None]:
class_mapping = {0: 'anger', 1: 'anticipation', 2: 'disgust', 3: 'fear',
                 4: 'sadness', 5: 'surprise', 6: 'trust', 7: 'joy'}
# Convert list of numbers to list of class names
y_preds = [class_mapping[pred] for pred in y_preds]

# Create submission dataframe
sub_df = pd.DataFrame({'tweet_id': test_tweet_id, 'prediction': y_preds})

In [None]:
# Rename the columns
sub_df.rename(columns={'tweet_id':'id', 'prediction':'emotion'}, inplace=True)
# Reset the index, removing the current index
sub_df = sub_df.reset_index(drop=True)

In [None]:
# Convert submission dataframe to csv
sub_df.to_csv("/content/gdrive/MyDrive/gideon/submission_2.csv", index=False)