In [None]:
#Install the Transformer
pip install transformers



In [None]:
import numpy as np
import pandas as pd
import torch
import transformers  as ppb 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Pretrained checkpoint and the matching model / tokenizer classes.
# DistilBERT is the default because full BERT needs more RAM than the Colab
# runtime offers. To use BERT, swap in the value noted at the end of each line.
model_class = ppb.DistilBertModel               # BERT: ppb.BertModel
tokenizer_class = ppb.DistilBertTokenizer       # BERT: ppb.BertTokenizer
pretrained_weights = 'distilbert-base-uncased'  # BERT: 'bert-base-uncased'

# Load the pretrained tokenizer and model.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# To run the model on GPU, uncomment:
# model.cuda()

In [None]:
# Load the tweet dataset. The cells below read column 0 as the tweet text and
# column 1 as the author label.
import pandas as pd

dataset = pd.read_csv('/content/100_tweets_per_user_new.csv')

# Label column as a NumPy array.
y = dataset.iloc[:, 1].to_numpy()

In [None]:
# Training split: the rows are walked in 50 consecutive blocks of 100, and the
# first 90 rows of every block go to the training set.
train_rows = [row for start in range(0, 50 * 100, 100) for row in range(start, start + 90)]
X_train = [dataset.iloc[row, 0] for row in train_rows]
y_train = [dataset.iloc[row, 1] for row in train_rows]

In [None]:
# Preview a few training tweets instead of dumping the whole list into the output.
X_train[:5]

["b'India and Benin should broaden trade ties: Tharoor - India and Benin need to broaden and deepen trade ties and look... http://ow.ly/15WSX8\\n'",
 "b'Parliament should have 100 sittings a year: Meira Kumar - Lok Sabha Speaker Meira Kumar Wednesday expressed her con... http://ow.ly/15MtWl\\n'",
 'b"PM launches Cairn India\'s oil fields in Barmer - Prime Minister Manmohan Singh Saturday formally inaugurated Cairn ... http://ow.ly/15MUVZ\\n"',
 "b'Jharkhand governor to hold additional charge of Andhra - Jharkhand Governor K. Sankaranarayanan will hold additiona... http://ow.ly/16dzh3\\n'",
 "b'British national found dead in Goa village - A 39-year-old British national was found dead in his guest house room ... http://ow.ly/15Xd1t\\n'",
 'b"Kashmiri Hindus heckle Yasin Malik at peace conference - Jammu and Kashmir\'s separatist leader Yasin Malik was heck... http://ow.ly/16joeW\\n"',
 'b\'Shazahn Padamsee loves kids.. - Shazahn Padamsee is fond of kids. "Being with kids lights up Shazah

In [None]:
# Test split: the last 10 rows (offsets 90..99) of each of the 50 blocks of 100.
test_rows = [row for start in range(90, 90 + 50 * 100, 100) for row in range(start, start + 10)]
X_test = [dataset.iloc[row, 0] for row in test_rows]
y_test = [dataset.iloc[row, 1] for row in test_rows]

In [None]:
# Turn each Python list into a 2-D, single-column NumPy array (via a DataFrame)
# so that text and labels can be joined column-wise in the next cell.
X_train = pd.DataFrame(X_train).to_numpy()
X_test = pd.DataFrame(X_test).to_numpy()
y_train = pd.DataFrame(y_train).to_numpy()
y_test = pd.DataFrame(y_test).to_numpy()

In [None]:
# Glue text and label columns side by side so each row is shuffled as a unit.
train = np.hstack((X_train, y_train))
test = np.hstack((X_test, y_test))

In [None]:
# Shuffle the rows in place with a fixed seed so the row order (and with it the
# later train/eval split and batch order) is the same on every run.
# A dedicated RandomState is used so NumPy's global RNG state is left untouched.
shuffle_rng = np.random.RandomState(0)
shuffle_rng.shuffle(train)
shuffle_rng.shuffle(test)

In [None]:
# Back to DataFrames: column 0 = text, column 1 = label.
train, test = pd.DataFrame(train), pd.DataFrame(test)

In [None]:
import nltk
# Fetch the WordNet corpus; the preprocessing cell below instantiates WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import nltk
# Fetch the stop-word lists; preprocess() reads stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(sentence):
  """Normalize one tweet into a space-separated string of content words.

  Steps: lower-case, drop the literal '{html}' marker, strip HTML-like tags,
  URLs and digit runs, split into word-character tokens, then keep the tokens
  that are longer than two characters and are not English stop words.

  Args:
    sentence: Raw tweet; any object is accepted and converted with ``str``.

  Returns:
    The surviving tokens joined by single spaces ('' when none survive).
  """
  text = str(sentence).lower().replace('{html}', "")
  # Strip HTML-like tags. The earlier pattern '<.*?>#@' only matched a tag that
  # was immediately followed by the literal characters '#@', so in practice no
  # tag was removed. '#' and '@' themselves are dropped by the tokenizer below.
  text = re.sub(r'<.*?>', '', text)
  text = re.sub(r'http\S+', '', text)
  text = re.sub('[0-9]+', '', text)
  # Named word_tokenizer so it cannot be confused with the notebook-level BERT tokenizer.
  word_tokenizer = RegexpTokenizer(r'\w+')
  tokens = word_tokenizer.tokenize(text)
  # Build the stop-word set once per call; it used to be re-read for every token.
  stop_words = set(stopwords.words('english'))
  filtered_words = [w for w in tokens if len(w) > 2 and w not in stop_words]
  # Stemmed / lemmatized variants used to be computed here and then discarded;
  # the returned value was, and still is, the unstemmed filtered words.
  return " ".join(filtered_words)


# Clean the text column (column 0) of both splits in place.
for frame in (train, test):
  frame.iloc[:, 0] = frame.iloc[:, 0].map(preprocess)

In [None]:
# Separate text (column 0) from label (column 1) for each split.
X_train, y_train = train.iloc[:, 0], train.iloc[:, 1]
X_test, y_test = test.iloc[:, 0], test.iloc[:, 1]

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Map the author labels to integer class ids, then one-hot encode them.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y_train)
y_train = np_utils.to_categorical(encoded_Y)

Using TensorFlow backend.


In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Encode the test labels with the encoder that was fitted on the training
# labels. Fitting a second encoder on y_test only yields the same class ids when
# both splits contain exactly the same label set; otherwise the one-hot columns
# of y_train and y_test would silently refer to different authors.
encoded_Y = encoder.transform(y_test)
# Pin the width to the number of training classes so y_test always lines up
# with the classifier output, even if a class is missing from the test split.
y_test = np_utils.to_categorical(encoded_Y, num_classes=len(encoder.classes_))

In [None]:
# Wrap the preprocessed training text (a Series at this point) in a one-column DataFrame.
X_train=pd.DataFrame(X_train)

In [None]:
# Encode every training tweet into a list of token ids, special tokens included.
tokenized = X_train.iloc[:, 0].apply(
    lambda text: tokenizer.encode(str(text), add_special_tokens=True)
)

In [None]:

# Length of the longest encoded training sequence.
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every sequence with id 0 up to max_len so they stack into a matrix.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# Padded token-id matrix as a torch tensor (the model input).
input_ids = torch.tensor(np.asarray(padded))


In [None]:
# Attention mask: 1 wherever the token id is non-zero, 0 on the padding positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(5000, 44)

In [None]:
# Convert the padded ids and the mask to torch tensors.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [None]:
# Forward pass without gradient tracking; element [0] of the result is what the
# following cells use as the per-token features.
with torch.no_grad():
    last_hidden_states_train = model(input_ids, attention_mask=attention_mask)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the training features (fixed seed) for validation during fitting.
X_train, X_eval, y_train, y_eval = train_test_split(
    last_hidden_states_train[0],
    y_train,
    test_size=0.05,
    random_state=0,
)

In [None]:
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

# Stop training when val_loss has not improved for 3 epochs.
es = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=1)

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D
from keras.layers import MaxPooling1D,AveragePooling1D,Dropout
from keras.layers import Flatten
from keras.layers import Dense

In [None]:
# 1-D CNN over the per-token features, ending in a softmax over the author classes.
# The input shape and the number of classes are read from the training data
# instead of being hard-coded (previously (44, 768) and 50), so the model keeps
# matching the data if the tokenized length or the author set changes.
seq_shape = tuple(X_train.shape[1:])  # everything after the sample axis
n_classes = y_train.shape[1]          # width of the one-hot label matrix

classifier = Sequential()
classifier.add(Conv1D(filters=52, kernel_size=3, activation='relu', input_shape=seq_shape))
classifier.add(Conv1D(filters=28, kernel_size=3, activation='relu'))
#classifier.add(Dropout(0.5))
classifier.add(MaxPooling1D(pool_size=4))
classifier.add(Flatten())
classifier.add(Dense(n_classes, activation='softmax'))
classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
classifier.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 42, 52)            119860    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 40, 28)            4396      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 10, 28)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 280)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                14050     
Total params: 138,306
Trainable params: 138,306
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the CNN; early stopping watches the held-out eval split.
classifier.fit(
    np.array(X_train),
    np.array(y_train),
    batch_size=128,
    epochs=25,
    verbose=1,
    callbacks=[es],
    validation_data=(np.array(X_eval), np.array(y_eval)),
)

Train on 4750 samples, validate on 250 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 00013: early stopping


<keras.callbacks.callbacks.History at 0x7fc5c0e29320>

In [None]:
# Tokenize the test tweets the same way as the training tweets.
X_test = pd.DataFrame(X_test)
tokenized = X_test.iloc[:, 0].apply(
    lambda text: tokenizer.encode(str(text), add_special_tokens=True)
)

In [None]:

# Length of the longest encoded test sequence (kept for inspection).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Pad or truncate every sequence to the length the classifier was built for.
# The previous hard-coded 36 did not match the classifier's input length and
# produced ragged rows whenever a test tweet encoded to more than 36 ids.
seq_len = classifier.input_shape[1]
padded = np.array([ids[:seq_len] + [0] * (seq_len - len(ids)) for ids in tokenized.values])

In [None]:
# Padded test token ids as a torch tensor.
input_ids = torch.tensor(np.asarray(padded))

In [None]:
# Attention mask: 1 wherever the token id is non-zero, 0 on the padding positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(250, 36)

In [None]:
# Convert the padded ids and the mask to torch tensors.
# For GPU inference, append `.to(device)` to both tensors.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [None]:
# Forward pass on the test batch without gradient tracking.
with torch.no_grad():
    last_hidden_states_test = model(input_ids, attention_mask=attention_mask)

In [None]:
# Convert the torch features to a NumPy array before handing them to Keras,
# exactly as the fit cell does, rather than relying on implicit conversion.
y_pred = classifier.predict(np.array(last_hidden_states_test[0]))

In [None]:
# Turn every row of class scores into a one-hot row with a single 1 at the arg-max.
# Comparing each entry with the row maximum (the previous approach) marks every
# tied maximum and marks nothing for an all-NaN row, which makes the list of
# predicted ids a different length from the list of true ids.
y_pred = np.asarray(y_pred)
y_pred = np.eye(y_pred.shape[1], dtype=int)[np.argmax(y_pred, axis=1)]

In [None]:
# Same "row maximum becomes 1, everything else 0" conversion for the ground truth.
truth = pd.DataFrame(y_test)
row_max = truth.where(truth != 0).max(axis=1)
y_test = truth.eq(row_max, axis=0).astype(int).to_numpy()

In [None]:
# Column index of every 1 in the one-hot ground truth, row by row.
result = [col for row in y_test for col, flag in enumerate(row) if flag == 1]


In [None]:
# Column index of every 1 in the one-hot predictions, row by row.
predicted = [col for row in y_pred for col, flag in enumerate(row) if flag == 1]

In [None]:
# True class ids, then predicted class ids, one list per line.
for class_ids in (result, predicted):
  print(class_ids)

[30, 30, 23, 34, 2, 46, 15, 10, 27, 44, 4, 31, 3, 1, 36, 27, 49, 44, 33, 4, 14, 39, 27, 33, 34, 49, 42, 37, 19, 36, 15, 12, 22, 35, 27, 5, 35, 1, 33, 16, 6, 6, 29, 48, 40, 46, 19, 5, 48, 13, 33, 17, 48, 9, 25, 39, 28, 10, 7, 12, 37, 22, 0, 42, 8, 20, 43, 7, 8, 49, 29, 31, 17, 3, 38, 0, 41, 40, 47, 0, 24, 15, 46, 36, 7, 49, 17, 12, 40, 1, 44, 36, 10, 17, 7, 2, 35, 24, 24, 44, 24, 18, 13, 42, 32, 32, 14, 49, 5, 42, 41, 21, 22, 42, 16, 28, 2, 28, 29, 9, 28, 11, 25, 45, 13, 23, 3, 35, 18, 21, 43, 26, 47, 16, 23, 29, 1, 22, 2, 30, 18, 32, 39, 37, 0, 20, 38, 37, 40, 35, 25, 3, 20, 27, 13, 18, 1, 26, 11, 22, 30, 9, 24, 43, 8, 45, 9, 39, 17, 8, 46, 11, 2, 31, 44, 32, 7, 34, 10, 45, 18, 46, 9, 45, 34, 8, 30, 36, 5, 20, 6, 19, 15, 4, 16, 16, 41, 43, 12, 10, 15, 29, 4, 31, 19, 14, 11, 32, 47, 48, 26, 14, 41, 21, 14, 28, 26, 40, 3, 21, 6, 43, 20, 37, 41, 19, 0, 5, 21, 39, 12, 48, 38, 47, 13, 23, 34, 47, 31, 38, 25, 33, 26, 11, 23, 6, 45, 25, 4, 38]
[23, 18, 28, 44, 8, 46, 1, 38, 0, 1, 4, 4, 3, 11,

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true class ids against predicted class ids.
cm = confusion_matrix(y_true=result, y_pred=predicted)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Print the confusion matrix, overall accuracy and per-class report.
print('Confusion Matrix :', cm, sep='\n')
print('Accuracy Score :', accuracy_score(result, predicted))
print('Report : ', classification_report(result, predicted), sep='\n')


Confusion Matrix :
[[2 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Accuracy Score : 0.216
Report : 
              precision    recall  f1-score   support

           0       0.67      0.40      0.50         5
           1       0.20      0.20      0.20         5
           2       0.50      0.40      0.44         5
           3       0.50      0.20      0.29         5
           4       0.15      0.40      0.22         5
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         5
           7       0.16      0.60      0.25         5
           8       0.43      0.60      0.50         5
           9       1.00      0.40      0.57         5
          10       0.00      0.00      0.00         5
          11       0.00      0.00      0.00         5
          12       0.25      0.60      0.35         5
          13       0.38      0.60      0.46         5
          14     

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Single example tweet to classify and visualize.
# `typr_here` (sic) is the DataFrame name that the following cells read.
type_here = ['@appetite H1N1 UPDATE: Immunization clinics open doors to high school students - LondonTopic.ca: H1N1 UPDATE: Immunization clini..\n']
typr_here = pd.DataFrame(type_here)

In [None]:
# Clean the example tweet with the same preprocessing as the training data.
typr_here.iloc[:, 0] = typr_here.iloc[:, 0].map(preprocess)

In [None]:
# Encode the example tweet into token ids, special tokens included.
tokenized = typr_here.iloc[:, 0].apply(
    lambda text: tokenizer.encode(str(text), add_special_tokens=True)
)

In [None]:

# Length of the encoded example (kept for inspection).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Pad or truncate to the sequence length the classifier was built for, instead
# of a hard-coded 44; a longer example is truncated rather than left over-long.
seq_len = classifier.input_shape[1]
padded = np.array([ids[:seq_len] + [0] * (seq_len - len(ids)) for ids in tokenized.values])

In [None]:
# Padded example token ids as a torch tensor.
input_ids = torch.tensor(np.asarray(padded))

In [None]:
# Attention mask: 1 wherever the token id is non-zero, 0 on the padding positions.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(1, 44)

In [None]:
# Convert the padded ids and the mask to torch tensors.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [None]:
# Forward pass on the single example without gradient tracking.
# Note: this reuses (overwrites) the name last_hidden_states_test.
with torch.no_grad():
    last_hidden_states_test = model(input_ids, attention_mask=attention_mask)

In [None]:
# Convert the torch features to a NumPy array before handing them to Keras,
# exactly as the fit cell does, rather than relying on implicit conversion.
y_pred = classifier.predict(np.array(last_hidden_states_test[0]))


In [None]:
# Features of the example tweet; fed to the Keras backend function below.
Xtst=last_hidden_states_test[0]

In [None]:
# Index of the most probable author class for the example tweet.
class_idx = np.argmax(y_pred[0])
# Output unit of the classifier that corresponds to that class.
class_output = classifier.output[:, class_idx]
# First Conv1D layer of the classifier, looked up by position. Keras generates
# layer names such as "conv1d_3" from a per-session counter, so a lookup by
# that name only works for one particular execution history.
# NOTE(review): despite the variable name this is the first, not the last,
# convolution; the name-based lookup it replaces selected the same layer.
last_conv_layer = classifier.layers[0]

In [None]:
# `K` used to be imported only in a later cell, so this cell raised NameError
# on a fresh top-to-bottom run. Import it where it is first needed.
from keras import backend as K

# Gradient of the selected class score with respect to the conv layer output.
grads = K.gradients(class_output, last_conv_layer.output)[0]
# Single scalar: the mean over every element of the gradient tensor.
pooled_grads = K.mean(grads)
# Backend function returning that scalar and the conv activations of sample 0.
iterate = K.function([classifier.input], [pooled_grads, last_conv_layer.output[0]])
pooled_grads_value, conv_layer_output_value = iterate([Xtst])

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from IPython.display import HTML
from sklearn.model_selection import train_test_split
from numpy import array
from numpy import argmax
from keras.utils import to_categorical
from keras.models import Model
from keras import backend as K 
from keras.models import Sequential
from keras import layers
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


# Bare attribute read: it is not the last expression of the cell, so nothing is
# displayed and no option is changed.
pd.options.display.max_rows
# Do not truncate long cell contents when displaying frames.
# NOTE(review): newer pandas releases deprecate -1 here in favour of None —
# confirm against the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1)



In [None]:

# Activation strength per position: average over the last (filter) axis and
# clamp negatives to 0.
heatmap = np.mean(conv_layer_output_value, axis=-1)
heatmap = np.maximum(heatmap, 0)
# Normalise to [0, 1]. When every value is 0 the division would be 0/0 and fill
# the heatmap with NaN, so it is skipped in that case.
peak = np.max(heatmap)
if peak > 0:
    heatmap = heatmap / peak


In [None]:
# Ratio between the classifier's input sequence length and the conv layer's
# output length; used to map a word position onto a heatmap position.
# The numerator was a hard-coded 36, which does not match the 44-token input
# the classifier is built for; it is now read from the model.
norm_len = classifier.input_shape[1] / last_conv_layer.output_shape[1]

In [None]:
# Display the raw prediction array for the example tweet.
y_pred

array([[0.01182991]], dtype=float32)

In [None]:
from html import escape

# Report the author the model actually predicts. The previous code printed a
# hard-coded author id, and its confidence formula was written for a single
# sigmoid output, so it was meaningless for the multi-class softmax output.
probs = y_pred[0]
best_idx = int(np.argmax(probs))
# Map the class id back to the original author label.
pred = encoder.inverse_transform([best_idx])[0]
# Confidence = softmax probability of the predicted class, in percent.
confidence = float(probs[best_idx]) * 100

html = ""
html += "<span><h3>Based on the description, the model believes that text belongs to {} author ".format(escape(str(pred)))
html += "<small><br>Confidence: {:.0f}%<br><br></small></h3></span>".format(confidence)
for j, word in enumerate(type_here[0].split()):
  # Clamp so a long tweet cannot index past the end of the heatmap.
  pos = min(math.floor(j / norm_len), len(heatmap) - 1)
  # Escape the word: tweet text may contain '<', '>' or '&'.
  html += "<span style='background-color:rgba({},0,150,{})'>{} </span>".format(heatmap[pos]*255, heatmap[pos]-0.3, escape(word))

HTML(html)
