## Model Building

## Prepare Dataset for Modelling

In [1]:
import pandas as pd
import numpy as np
import os
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", lambda x: '%.2f' % x)

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
love_island_data = pd.read_csv('outputs/love_island_data.csv')
print("'love_island_data' successfully loaded...")

'love_island_data' successfully loaded...


In [3]:
love_island_data.shape

(2998, 11)

In [4]:
love_island_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0.1                    2998 non-null   int64  
 1   Unnamed: 0                      2998 non-null   int64  
 2   author.properties.friends       2998 non-null   int64  
 3   author.properties.status_count  2998 non-null   float64
 4   author.properties.verified      2998 non-null   bool   
 5   content.body                    2998 non-null   object 
 6   location.country                2998 non-null   object 
 7   properties.platform             2998 non-null   object 
 8   properties.sentiment            2998 non-null   float64
 9   location.latitude               2998 non-null   float64
 10  location.longitude              2998 non-null   float64
dtypes: bool(1), float64(4), int64(3), object(3)
memory usage: 237.3+ KB


In [5]:
# Drop duplicate and irrelevant columns in dataframe
del love_island_data['Unnamed: 0.1']
del love_island_data ['Unnamed: 0']

In [6]:
love_island_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998 entries, 0 to 2997
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   author.properties.friends       2998 non-null   int64  
 1   author.properties.status_count  2998 non-null   float64
 2   author.properties.verified      2998 non-null   bool   
 3   content.body                    2998 non-null   object 
 4   location.country                2998 non-null   object 
 5   properties.platform             2998 non-null   object 
 6   properties.sentiment            2998 non-null   float64
 7   location.latitude               2998 non-null   float64
 8   location.longitude              2998 non-null   float64
dtypes: bool(1), float64(4), int64(1), object(3)
memory usage: 190.4+ KB


In [7]:
love_island_data.head(3)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
0,1689,22566.0,False,Can't believe I'm missing Love Island 😩,GB,twitter,1.0,51.57,0.46
1,114,1377.0,False,Last tweet about future wedding..... if I actually want a wedding I actually need to find a guy XD we all know I'm a loner. unlovable,GB,twitter,1.0,52.97,-1.17
2,568,8375.0,False,"How many times does he wonna say the phrase ""i deal with shit"" #LoveIsland",GB,twitter,-1.0,51.39,0.03


## Seting the Train_Test_Split

Split is `30%` to `70%` that is, `0.3`, `0.7`

In [9]:
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

In [10]:
X = love_island_data[["author.properties.friends",
                    "author.properties.status_count",
                    "author.properties.verified","content.body",
                    "location.country","properties.platform",
                    "properties.sentiment","location.latitude",
                    "location.longitude"]]

In [14]:
type(X)

pandas.core.frame.DataFrame

In [11]:
X.head(3)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
0,1689,22566.0,False,Can't believe I'm missing Love Island 😩,GB,twitter,1.0,51.57,0.46
1,114,1377.0,False,Last tweet about future wedding..... if I actually want a wedding I actually need to find a guy XD we all know I'm a loner. unlovable,GB,twitter,1.0,52.97,-1.17
2,568,8375.0,False,"How many times does he wonna say the phrase ""i deal with shit"" #LoveIsland",GB,twitter,-1.0,51.39,0.03


In [13]:
y = love_island_data["properties.sentiment"]

In [14]:
type(y)

pandas.core.series.Series

In [15]:
y.head()

0    1.00
1    1.00
2   -1.00
3   -1.00
4    0.00
Name: properties.sentiment, dtype: float64

In [16]:
train_data, test_data = train_test_split(love_island_data,test_size = 0.30, random_state =42)

In [17]:
train_data.shape, test_data.shape

((2098, 9), (900, 9))

In [18]:
train_data.head(3)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
858,2723,27039.0,False,Ain't gna stress it anymore😴,GB,twitter,-1.0,51.45,-0.98
1011,278,31474.0,False,@Wackkyyy Yes if you take your shirt off like you did in the skype call. 👀,GB,twitter,-1.0,51.6,-0.34
48,422,1083.0,False,New Music Alert https://t.co/nDKbwFcD7d,GB,twitter,1.0,51.51,-0.12


In [19]:
test_data.head(3)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude
1376,282,2085.0,False,@Donforester Many established 1st generation immigrants want to restrict immigration. Many 2nd generation eg Irish more welcoming,GB,twitter,0.0,53.42,-2.92
932,51,12533.0,False,@smollyalexander thank u hunty,GB,twitter,1.0,53.37,-2.17
144,931,307.0,False,Hedge removal part one...!! @ Dalkeith https://t.co/slEBFhE0w9,GB,twitter,-1.0,55.87,-3.07


## Model Building using Autogluon Tabular Predictor

In [21]:
label = 'properties.sentiment'

In [22]:
%%time

save_path = 'models/new'
predictor = TabularPredictor(label=label, path=save_path).fit(train_data)

Beginning AutoGluon training ...
AutoGluon will save models to "models/new/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 20.6.0: Fri Dec 16 00:35:00 PST 2022; root:xnu-7195.141.49~1/RELEASE_X86_64
Train Data Rows:    2098
Train Data Columns: 8
Label Column: properties.sentiment
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
	3 unique label values:  [-1.0, 1.0, 0.0]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 3
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    7221.53 MB

CPU times: user 37.6 s, sys: 2.69 s, total: 40.3 s
Wall time: 26.1 s


In [46]:
predictor = TabularPredictor.load("models/new")

In [47]:
predictor.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.67,0.01,6.01,0.01,6.01,1,True,6
1,WeightedEnsemble_L2,0.67,0.01,6.64,0.0,0.63,2,True,11
2,XGBoost,0.65,0.01,1.57,0.01,1.57,1,True,9
3,RandomForestGini,0.65,0.09,1.32,0.09,1.32,1,True,4
4,RandomForestEntr,0.64,0.16,1.41,0.16,1.41,1,True,5
5,ExtraTreesGini,0.63,0.07,0.86,0.07,0.86,1,True,7
6,NeuralNetFastAI,0.62,0.02,4.28,0.02,4.28,1,True,3
7,ExtraTreesEntr,0.62,0.07,0.86,0.07,0.86,1,True,8
8,NeuralNetTorch,0.6,0.03,3.83,0.03,3.83,1,True,10
9,KNeighborsUnif,0.47,0.14,2.16,0.14,2.16,1,True,1


## Model Evaluation

load test data to make predictions

In [48]:
y_test = test_data[label]

In [49]:
y_test.head()

1376    0.00
932     1.00
144    -1.00
1752    1.00
51     -1.00
Name: properties.sentiment, dtype: float64

Define the values to predict

In [26]:
y_test = test_data[label]

In [27]:
y_test [0:5]

1376    0.00
932     1.00
144    -1.00
1752    1.00
51     -1.00
Name: properties.sentiment, dtype: float64

Delete label/target column to avoid cheating.

In [28]:
test_data_nolab = test_data.drop(columns=[label])

In [29]:
test_data_nolab.head(3)

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,location.latitude,location.longitude
1376,282,2085.0,False,@Donforester Many established 1st generation immigrants want to restrict immigration. Many 2nd generation eg Irish more welcoming,GB,twitter,53.42,-2.92
932,51,12533.0,False,@smollyalexander thank u hunty,GB,twitter,53.37,-2.17
144,931,307.0,False,Hedge removal part one...!! @ Dalkeith https://t.co/slEBFhE0w9,GB,twitter,55.87,-3.07


confirming defined save path & predictor

In [30]:
save_path

'models/new'

In [50]:
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x127f76af0>

In [34]:
save_model_predictor = TabularPredictor.load(save_path)

In [35]:
save_model_predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x14dc0d790>

In [37]:
y_pred = save_model_predictor.predict(test_data_nolab)

In [38]:
y_pred[0:5]

1376   -1.00
932    -1.00
144    -1.00
1752    1.00
51     -1.00
Name: properties.sentiment, dtype: float64

In [41]:
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: accuracy on test data: 0.6244444444444445
Evaluations on test data:
{
    "accuracy": 0.6244444444444445,
    "balanced_accuracy": 0.5698524240516779,
    "mcc": 0.3923485132664916
}


# Predictor Leaderboard

Comparing the perfomance of autogluon models with the Predictor Leaderboard

In [42]:
save_model_predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesGini,0.64,0.63,0.17,0.07,0.86,0.17,0.07,0.86,1,True,7
1,RandomForestGini,0.63,0.65,0.22,0.09,1.32,0.22,0.09,1.32,1,True,4
2,ExtraTreesEntr,0.63,0.62,0.2,0.07,0.86,0.2,0.07,0.86,1,True,8
3,XGBoost,0.63,0.65,0.06,0.01,1.57,0.06,0.01,1.57,1,True,9
4,CatBoost,0.62,0.67,0.02,0.01,6.01,0.02,0.01,6.01,1,True,6
5,WeightedEnsemble_L2,0.62,0.67,0.02,0.01,6.64,0.0,0.0,0.63,2,True,11
6,RandomForestEntr,0.62,0.64,0.19,0.16,1.41,0.19,0.16,1.41,1,True,5
7,NeuralNetTorch,0.6,0.6,0.05,0.03,3.83,0.05,0.03,3.83,1,True,10
8,NeuralNetFastAI,0.58,0.62,0.05,0.02,4.28,0.05,0.02,4.28,1,True,3
9,KNeighborsUnif,0.44,0.47,0.03,0.14,2.16,0.03,0.14,2.16,1,True,1


## Feature Importance

Checking the importance of different features (columns) on the test and train data

For `test_data`...

In [43]:
save_model_predictor.feature_importance(test_data, silent=True)

These features in provided data are not utilized by the predictor and will be ignored: ['properties.platform']


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
content.body,0.24,0.02,0.0,5,0.27,0.2
location.longitude,0.01,0.0,0.01,5,0.02,-0.0
author.properties.friends,0.01,0.0,0.01,5,0.02,-0.0
author.properties.status_count,0.0,0.01,0.29,5,0.01,-0.01
location.country,0.0,0.0,0.5,5,0.0,0.0
author.properties.verified,0.0,0.0,0.5,5,0.0,0.0
location.latitude,-0.0,0.0,0.86,5,0.01,-0.01


For `train_data`...

In [45]:
save_model_predictor.feature_importance(train_data, silent=True)

These features in provided data are not utilized by the predictor and will be ignored: ['properties.platform']


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
content.body,0.37,0.0,0.0,5,0.38,0.36
author.properties.status_count,0.03,0.01,0.0,5,0.04,0.02
author.properties.friends,0.03,0.0,0.0,5,0.03,0.02
location.latitude,0.03,0.0,0.0,5,0.03,0.02
location.longitude,0.02,0.0,0.0,5,0.02,0.01
location.country,0.0,0.0,0.5,5,0.0,0.0
author.properties.verified,-0.0,0.0,0.81,5,0.0,-0.0


## Bringing it all together

In [42]:
test_data["predicted_sentiment"] = y_pred

In [43]:
test_data.head()

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,properties.sentiment,location.latitude,location.longitude,predicted_sentiment
1376,282,2085.0,False,@Donforester Many established 1st generation immigrants want to restrict immigration. Many 2nd generation eg Irish more welcoming,GB,twitter,0.0,53.42,-2.92,-1.0
932,51,12533.0,False,@smollyalexander thank u hunty,GB,twitter,1.0,53.37,-2.17,-1.0
144,931,307.0,False,Hedge removal part one...!! @ Dalkeith https://t.co/slEBFhE0w9,GB,twitter,-1.0,55.87,-3.07,-1.0
1752,291,1580.0,False,God I love @KevinHart4real 😂 snapchats making my night,GB,twitter,1.0,51.42,-0.45,1.0
51,680,3691.0,False,@MargevonMarge Blimey. You still haven't served enough time here? #EUref #Remain,GB,twitter,-1.0,53.55,-0.66,-1.0


In [44]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900 entries, 1376 to 1005
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   author.properties.friends       900 non-null    int64  
 1   author.properties.status_count  900 non-null    float64
 2   author.properties.verified      900 non-null    bool   
 3   content.body                    900 non-null    object 
 4   location.country                900 non-null    object 
 5   properties.platform             900 non-null    object 
 6   properties.sentiment            900 non-null    float64
 7   location.latitude               900 non-null    float64
 8   location.longitude              900 non-null    float64
 9   predicted_sentiment             900 non-null    float64
dtypes: bool(1), float64(5), int64(1), object(3)
memory usage: 103.5+ KB


## Using input for prediction

In [46]:
test_data["author.properties.verified"].unique()

array([False,  True])

In [47]:
test_data["properties.platform"].unique()

array(['twitter'], dtype=object)

In [48]:
test_data["properties.sentiment"].unique()

array([ 0.,  1., -1.])

In [57]:
## Creating sample input
input_data_dict = {
    'author.properties.friends': 958,
    'author.properties.status_count': 7024,
    'author.properties.verified': 'False',
    'content.body': '#LoveIsland is the biggest show in Europe right now!!',
    'location.country': 'GB',
    'location.latitude': 53.3887,
    'location.longitude': -1.4699
}

In [58]:
input_data_dict

{'author.properties.friends': 958,
 'author.properties.status_count': 7024,
 'author.properties.verified': 'False',
 'content.body': '#LoveIsland is the biggest show in Europe right now!!',
 'location.country': 'GB',
 'properties.platform': 'twitter',
 'location.latitude': 53.3887,
 'location.longitude': -1.4699}

In [59]:
input_data = pd.DataFrame([input_data_dict])

In [60]:
input_data

Unnamed: 0,author.properties.friends,author.properties.status_count,author.properties.verified,content.body,location.country,properties.platform,location.latitude,location.longitude
0,958,7024,False,#LoveIsland is the biggest show in Europe right now!!,GB,twitter,53.39,-1.47


In [61]:
save_model_predictor.predict(input_data)

0   1.00
Name: properties.sentiment, dtype: float64

In [62]:
save_model_predictor.predict(input_data)[0]

1.0

# Pushing to Streamlit 

In [56]:
import streamlit as st

In [63]:
```python
## This is going to be a Streamlit App

import streamlit as st

st.title('Love Island Online Sentiment Prediction')

author.properties.friends  = st.number_input ('number of friends', 0, 8000)
author.properties.status_count = st.number_input ('status_count', 0, 12000)
author.properties.verified = st.selectbox('verified', options=['Yes','No'])
content.body = st.text_area('post content')
location.country = st.selectbox('country', options=['GB','GG','JE','IM'])
location.longitude = st.number_input ('longitude')
location.latitude = st.number_input ('latitude')


```

SyntaxError: invalid syntax (3806686634.py, line 1)