# Importing dependencies

In [1]:
import numpy as np
import pandas as pd
import math
import re
from bs4 import BeautifulSoup

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Data preprocessing

In [8]:
# Load the training and test CSVs with one shared schema; the column names
# are supplied explicitly through `names`.
cols = ['sentiment', 'id', 'date', 'query', 'user', 'text']
train_data, test_data = (
    pd.read_csv(csv_path, names=cols, encoding='latin1', engine='python')
    for csv_path in ("data/train.csv", "data/test.csv")
)
# Preview the first rows of the test split
test_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [9]:
# Summarize the training frame: index range, columns, non-null counts, dtypes and memory usage
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   id         1600000 non-null  int64 
 2   date       1600000 non-null  object
 3   query      1600000 non-null  object
 4   user       1600000 non-null  object
 5   text       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [10]:
# Summarize the test frame: index range, columns, non-null counts, dtypes and memory usage
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  498 non-null    int64 
 1   id         498 non-null    int64 
 2   date       498 non-null    object
 3   query      498 non-null    object
 4   user       498 non-null    object
 5   text       498 non-null    object
dtypes: int64(2), object(4)
memory usage: 23.5+ KB


In [11]:
# Show any training rows whose `text` value equals the integer 1
train_data.loc[train_data["text"].eq(1)]

Unnamed: 0,sentiment,id,date,query,user,text


In [12]:
# cleaning: keep only the `sentiment` label and the tweet `text`.
# `drop` returns a new frame (no inplace mutation), and errors='ignore' keeps
# the cell safe to re-run after the columns have already been removed.
unused_cols = ['id', 'date', 'query', 'user']
train_data = train_data.drop(columns=unused_cols, errors='ignore')
test_data = test_data.drop(columns=unused_cols, errors='ignore')

In [13]:
def clean_tweets(tweet):
    """Normalize a raw tweet into plain text suitable for tokenization.

    Steps, in order:
      1. Extract the text content with BeautifulSoup (lxml parser).
      2. Replace @mentions with a space.
      3. Replace http/https URLs with a space.
      4. Replace every character that is not an ASCII letter or one of
         ``. ! ? '`` with a space.
      5. Collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet string. A single leading or trailing space left
        behind by the substitutions is not stripped.
    """
    tweet = BeautifulSoup(tweet, 'lxml').get_text()
    # Removing @mentions; handles may contain underscores, so `_` must be in
    # the class or "@user_name" would leave "name" behind in the text
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Removing urls; consume up to the next whitespace so query strings,
    # dashes, underscores, etc. do not survive as stray word fragments
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keep only letters and basic punctuation (. ! ? ')
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespace
    tweet = re.sub(r" +", " ", tweet)
    return tweet

In [14]:
# train_data['clean_text'] = train_data.text.apply(clean_tweets)
# test_data['clean_text'] = test_data.text.apply(clean_tweets)

In [15]:
# Run the cleaner over every training tweet, preserving row order
clean_data = list(map(clean_tweets, train_data["text"]))



In [17]:
# clean_data

In [18]:
# Remap label 4 -> 1 in the `sentiment` column only.
# A bare `df[mask] = 1` assigns 1 to EVERY column of the matching rows, which
# would also overwrite the tweet text; `.loc[mask, 'sentiment']` limits the
# write to the label column.
# NOTE(review): the test split may contain label values other than 0 and 4;
# those are left unchanged here — confirm that is intended.
train_data.loc[train_data.sentiment == 4, 'sentiment'] = 1
test_data.loc[test_data.sentiment == 4, 'sentiment'] = 1

In [19]:
# Distinct label values present in the training set after remapping
train_data["sentiment"].unique()

array([0, 1], dtype=int64)

In [20]:
# Tokenization
# clean_text = list(train_data.clean_text)
# The subword encoder lives under `tfds.deprecated.text` in newer
# tensorflow_datasets releases and under `tfds.features.text` in older ones;
# use whichever namespace this installation provides.
text_api = getattr(tfds, "deprecated", tfds.features).text
tokenizer = text_api.SubwordTextEncoder.build_from_corpus(
    clean_data, target_vocab_size=2**16
)
# Encode every cleaned tweet with the fitted subword tokenizer
data_inputs = [tokenizer.encode(sentence) for sentence in clean_data]

# Model Building

# Application