In [1]:
import tensorflow
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.datasets import imdb
from tensorflow.keras import initializers

import pandas as pd
import numpy as np

In [2]:
filepath = "data/Tweets.csv"
data = pd.read_csv(filepath)
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [4]:
df = data.drop(columns=['tweet_id', 'airline' , 'name', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence',
                          'airline_sentiment_gold', 'negativereason_gold','tweet_coord', 'tweet_location'])

In [5]:
df.head()

Unnamed: 0,retweet_count,text,tweet_created,user_timezone
0,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52 -0800,Eastern Time (US & Canada)
1,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59 -0800,Pacific Time (US & Canada)
2,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48 -0800,Central Time (US & Canada)
3,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36 -0800,Pacific Time (US & Canada)
4,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45 -0800,Pacific Time (US & Canada)


In [6]:
df.dtypes

retweet_count     int64
text             object
tweet_created    object
user_timezone    object
dtype: object

In [7]:
df['tweet_created'] = pd.to_datetime(df['tweet_created'])
df.dtypes

retweet_count                                     int64
text                                             object
tweet_created    datetime64[ns, pytz.FixedOffset(-480)]
user_timezone                                    object
dtype: object

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype                                 
---  ------         --------------  -----                                 
 0   retweet_count  14640 non-null  int64                                 
 1   text           14640 non-null  object                                
 2   tweet_created  14640 non-null  datetime64[ns, pytz.FixedOffset(-480)]
 3   user_timezone  9820 non-null   object                                
dtypes: datetime64[ns, pytz.FixedOffset(-480)](1), int64(1), object(2)
memory usage: 457.6+ KB


In [9]:
df['user_timezone'].value_counts()

Eastern Time (US & Canada)    3744
Central Time (US & Canada)    1931
Pacific Time (US & Canada)    1208
Quito                          738
Atlantic Time (Canada)         497
                              ... 
Saskatchewan                     1
Pretoria                         1
America/Atikokan                 1
America/Detroit                  1
Irkutsk                          1
Name: user_timezone, Length: 85, dtype: int64

In [10]:
df['user_timezone'].isna().sum()

4820

In [11]:
df['user_timezone'].mode()[0]

'Eastern Time (US & Canada)'

In [12]:
df['user_timezone'].fillna(df['user_timezone'].mode()[0], inplace=True)

In [13]:
df['user_timezone'].value_counts()

Eastern Time (US & Canada)    8564
Central Time (US & Canada)    1931
Pacific Time (US & Canada)    1208
Quito                          738
Atlantic Time (Canada)         497
                              ... 
Saskatchewan                     1
Pretoria                         1
America/Atikokan                 1
America/Detroit                  1
Irkutsk                          1
Name: user_timezone, Length: 85, dtype: int64

In [14]:
data['tweet_location'].value_counts()

Boston, MA                        157
New York, NY                      156
Washington, DC                    150
New York                          127
USA                               126
                                 ... 
Boston, Atlanta, or in-flight       1
pittsburgh                          1
Palo Alto, CA                       1
Greensboro NC                       1
Swooning in New York, New York      1
Name: tweet_location, Length: 3081, dtype: int64

In [15]:
#one-hot encoding user_timezone
df_user = pd.get_dummies(df['user_timezone'], columns=['user_timezone'])
df_user

Unnamed: 0,Abu Dhabi,Adelaide,Alaska,America/Atikokan,America/Boise,America/Chicago,America/Detroit,America/Los_Angeles,America/New_York,Amsterdam,...,Stockholm,Sydney,Taipei,Tehran,Tijuana,Tokyo,Vienna,Warsaw,Wellington,West Central Africa
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14637,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14638,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype                                 
---  ------         --------------  -----                                 
 0   retweet_count  14640 non-null  int64                                 
 1   text           14640 non-null  object                                
 2   tweet_created  14640 non-null  datetime64[ns, pytz.FixedOffset(-480)]
 3   user_timezone  14640 non-null  object                                
dtypes: datetime64[ns, pytz.FixedOffset(-480)](1), int64(1), object(2)
memory usage: 457.6+ KB


In [17]:
df = pd.concat([df, df_user], axis=1)
df

Unnamed: 0,retweet_count,text,tweet_created,user_timezone,Abu Dhabi,Adelaide,Alaska,America/Atikokan,America/Boise,America/Chicago,...,Stockholm,Sydney,Taipei,Tehran,Tijuana,Tokyo,Vienna,Warsaw,Wellington,West Central Africa
0,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52-08:00,Eastern Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59-08:00,Pacific Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48-08:00,Central Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36-08:00,Pacific Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45-08:00,Pacific Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,0,@AmericanAir thank you we got on a different f...,2015-02-22 12:01:01-08:00,Eastern Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14636,0,@AmericanAir leaving over 20 minutes Late Flig...,2015-02-22 11:59:46-08:00,Eastern Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14637,0,@AmericanAir Please bring American Airlines to...,2015-02-22 11:59:15-08:00,Eastern Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14638,0,"@AmericanAir you have my money, you change my ...",2015-02-22 11:59:02-08:00,Eastern Time (US & Canada),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df = df.drop(columns = ['user_timezone'])
df

Unnamed: 0,retweet_count,text,tweet_created,Abu Dhabi,Adelaide,Alaska,America/Atikokan,America/Boise,America/Chicago,America/Detroit,...,Stockholm,Sydney,Taipei,Tehran,Tijuana,Tokyo,Vienna,Warsaw,Wellington,West Central Africa
0,0,@VirginAmerica What @dhepburn said.,2015-02-24 11:35:52-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,@VirginAmerica plus you've added commercials t...,2015-02-24 11:15:59-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,@VirginAmerica I didn't today... Must mean I n...,2015-02-24 11:15:48-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,@VirginAmerica it's really aggressive to blast...,2015-02-24 11:15:36-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,@VirginAmerica and it's a really big bad thing...,2015-02-24 11:14:45-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,0,@AmericanAir thank you we got on a different f...,2015-02-22 12:01:01-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14636,0,@AmericanAir leaving over 20 minutes Late Flig...,2015-02-22 11:59:46-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14637,0,@AmericanAir Please bring American Airlines to...,2015-02-22 11:59:15-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14638,0,"@AmericanAir you have my money, you change my ...",2015-02-22 11:59:02-08:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
max_features = 20000
maxlen = 30
batch_size = 32

In [20]:
#vectorise text, one-hot encoding
sentences = [] #holds extracted sequences
next_chars = [] #holds the targets
step = 3

for i in range(0,len(df['text']) - maxlen, step):
    sentences.append(df['text'][i:i+maxlen])
    next_chars.append(df['text'][i+maxlen])
    
chars = sorted(set(df['text']))
char_indices = dict((char, chars.index(char)) for char in chars)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype = np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate (sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

In [28]:
#alternative vectorization of text --> HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=20)
vector = vectorizer.transform(df['text'])
vector

<14640x20 sparse matrix of type '<class 'numpy.float64'>'
	with 159310 stored elements in Compressed Sparse Row format>

In [30]:
vector.shape

(14640, 20)

In [31]:
from sklearn.model_selection import train_test_split 

X = vector
y = data['airline_sentiment']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [32]:
print('x_train:', x_train.shape, "y_train:", y_train.shape, "\n" , "x_test:", x_test.shape, "y_test:", y_test.shape)

X_train: (10248, 20) y_train: (10248,) 
 X_test: (4392, 20) y_test: (4392,)


In [37]:
x_train = sequence.pad_sequences(x_train,maxlen= maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print ('x_train shape:', x_train.shape)
print ('x_test shape:', x_test.shape)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [110]:
x_train[123,:]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
rnn_hidden_dim = 5
word_embedding_dim = 50
model_rnn = Sequential()
model_rnn.add(Embedding(max_features, word_embedding_dim))
model_rnn.add(SimpleRNN(rnn_hidden_dim,
                       kernel_initializer =initializers.RandomNormal(stddev=0.001),
                       recurrent_initializer = initializers.Identity(gain=1.0),
                       activation='relu',
                       input_shape= x_train.shape))

model_rnn.add(Dense(1, activation='sigmoid'))

In [25]:
model_rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 50)          1000000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 5)                 280       
_________________________________________________________________
dense (Dense)                (None, 1)                 6         
Total params: 1,000,286
Trainable params: 1,000,286
Non-trainable params: 0
_________________________________________________________________


In [26]:
rmsprop = keras.optimizers.RMSprop(lr= .0001)

model_rnn.compile(loss='binary_crossentropy',
                 optimizer = rmsprop,
                 metrics=['accuracy'])

In [27]:
model_rnn.fit(x_train, y_train,
             batch_size = batch_size,
             epochs = 10,
             validation_data= (x_test, y_test))

Epoch 1/10


ValueError: in user code:

    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\sequential.py:375 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\functional.py:425 call
        inputs, training=training, mask=mask)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\layers\recurrent.py:660 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\WeiEn\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\input_spec.py:223 assert_input_compatibility
        str(tuple(shape)))

    ValueError: Input 0 of layer simple_rnn is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (None, 30, 14427, 50)


In [133]:
(1,x_train.shape[0], x_train.shape[1], x_train.shape[2])

(1, 3409, 30, 14427)