Load workspace - we need this to be able to download the data

In [2]:
# load data
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
#from minisom import MiniSom Jupyter Notebook 
# standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
import sys
# packages for natural language processing
import gensim 
!pip install minisom
!pip install nltk
from minisom import MiniSom
import string
from gensim.models.doc2vec import Doc2Vec
import nltk
from nltk import RegexpTokenizer
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

%matplotlib inline

In [0]:
pd.get_option('max_colwidth')
pd.set_option('max_colwidth', 2000)

Load Data

In [0]:
data =pd.read_csv('/content/gdrive/My Drive/out.csv', sep=',') #.iloc[:, 1:]
# rename first column, which contains date
data=data.rename(columns = {'Unnamed: 0':'date'})
data

Delete special characters from tweets

In [0]:
data['tweet'] = data['tweet'].apply(lambda x: re.sub('([\.\",\(\)!\?;:])[!@#$:+).;,?&]1234567890/', '', x.lower()))
data['tweet'] = data['tweet'].apply(lambda x: re.sub('  ', ' ', x))
data['tweet'][1]

Some summary statistics

In [0]:
data['word_count'] = data['tweet'].str.count(' ') + 1
# words per tweet
data.groupby('party')['word_count'].mean()

In [0]:
# words per tweet and politician
data.groupby('real')['word_count'].mean()

In [0]:
#Posts per Politician
data['real'].value_counts()

In [0]:
stopwords.words('german')[:50]

Clean data for Doc2Vec:
Remove stop words, links, usernames
**To do: remove hashtags (?) and rt**

In [0]:
# remove stopwords
def cleaning(data):
  print("start cleaning vector")
  
  df = data
  # remove stopwords from data
  stop = set(stopwords.words('german', 'english')) 
  df['newtweet'] = df['tweet'].str.split()
  df['newtweet'] = df['newtweet'].apply(lambda x : [item for item in x if item not in stop])
  df["newtweet"]= df["newtweet"].str.join(" ") 
  
  # remove @usernames
  df['newtweet'] = df['newtweet'].apply(lambda x : re.sub(r'@[A-Za-z0-9]+','',x) )
  
  # remove urls
  df['newtweet'] = df['newtweet'].str.replace('http\S+|www.\S+', '', case=False)
  print("finished cleaning vector")
  return(df)

In [0]:
df = cleaning(data)
df

In [0]:
# save data
df.to_csv('/content/gdrive/My Drive/data_cleaned.csv', sep=',', encoding='utf-8', index=True)

Tag Data: helps to identify semantic structures in sentences, necessary for our doc2vec training
source: https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

Train our Doc2Vec model.
The model uses a neural network to depict similarity between sentences (in our case between tweets) as a numerical vector, which is exactly what we need as input for our SOM!

**To Do: research on how to choose parameters**



In [0]:
def trainmodel(cleanedtweets, max_epochs, vec_size, alpha , modelname):
  
  # tag data to identify semantic structures
  tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), 
                                tags=[str(i)]) for i, _d in 
                 enumerate(data['newtweet'])]

  model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
  model.build_vocab(tagged_data)
  
  for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

  model.save(str(modelname))
  print("Model" + str(modelname) +"Saved")

In [0]:
mydata = data['newtweet']
trainmodel(cleanedtweets = mydata, max_epochs = 100, vec_size = 30, alpha = 0.0025, modelname = "modelbasic")

In [0]:
# join tweets over all columns 
col = []
names = pd.unique(data['real'])
for x in range(0,len(names)):
 subset = data[data['real'] == names[x]]
 all = subset['newtweet'].str.cat(sep=' ')
 col.append({'name': names[x], 'tweets' :  all })

aggdata  = pd.DataFrame(col)

In [0]:
aggdata

In [0]:
aggmydata = aggdata['tweets']
trainmodel(cleanedtweets = aggmydata, max_epochs = 100, vec_size = 30, alpha = 0.0025, modelname = "aggmodel")

Load our trained model

In [0]:
model= Doc2Vec.load("modelbasic")
modelagg = Doc2Vec.load("aggmodel")

In [0]:
# fetch value of vector of first tweet
print(model.docvecs['1'])


In [50]:
# number of words in our vocabulary
len(model.wv.vocab)

7609

In [0]:
# number of trained document tags
len(model.docvecs)

In [0]:
# length of word vector
len(model.docvecs[1])

In [0]:
def assemble(data, model):
  #create indexvector to make labeling df easier
  # in case we want to change the vector size of the 
  # Doc2Vec model
  indexvec = []
  for col in range(0, len(model.docvecs[1])): 
    indexvec.append("x" + str(col+1))
  indexvec


   # assemble word vector as dataframe
  wordvector = pd.DataFrame(model.docvecs[1], [indexvec])
  for col in range(1, len(data)): 
   wordvector[col] = pd.DataFrame(model.docvecs[col], [indexvec])
  wordvector
  joint_data = pd.concat([data, wordvector.T ], axis=1, sort=False)
  return(joint_data)


In [0]:
obsb = assemble(data, model)

In [0]:
personb = assemble(aggdata, modelagg)


In [0]:
# save dfs so we can load them into our SOM code
obsb.to_csv('/content/gdrive/My Drive/data_wordvectors_obs.csv', sep=',', encoding='utf-8', index=True)
personb.to_csv('/content/gdrive/My Drive/data_wordvectors_pers.csv', sep=',', encoding='utf-8', index=True)
