
# Comparing Twitter behavior of German politicians before and after the European Election 2019 using Self-Organizing Maps
 
**1. Translate tweets into numerical data using Doc2Vec**

Student Project on Self-Organizing Maps 

---


Authors: Clara Hoffmann & Oliver Becker





In [0]:
# Make the project files reachable from the notebook.
# The notebook was written in Google Colab, so Google Drive is mounted here;
# when running somewhere else, replace this cell with whatever gives you
# access to your own working directory.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [0]:
# set path where you saved the data
# `path` is prepended to file names with `+` further down, so it has to end
# with a slash.
# Clara
path = "/content/drive/My Drive/"
# Oliver
#path = "/content/drive/My Drive/"

In [0]:
# standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import warnings
import sys
import datetime

# packages for natural language processing
import gensim 
!pip install nltk
import string
from gensim.models.doc2vec import Doc2Vec
import nltk
from nltk import RegexpTokenizer
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords
from six import string_types
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

%matplotlib inline

**Data preparation**

Subset to one month before and one month after the election (26 April – 26 June 2019, split at 18:00 on election day, 26 May). Remove stopwords, usernames, URLs and special characters.

In [0]:
# Widen the column display so that DataFrame output shows (almost) the whole
# tweet instead of a truncated preview.
# The fully qualified option name is used; the previous bare get_option call
# only returned the old value, which was then discarded.
pd.set_option('display.max_colwidth', 2000)

In [0]:
# Read the raw tweet table. The first CSV column has no header (pandas calls
# it 'Unnamed: 0') and contains the date, so it is renamed right away.
databig = (
    pd.read_csv(path + 'out.csv', sep=',')
    .rename(columns={'Unnamed: 0': 'date'})
)
databig  # show data

In [0]:
# Parse the date column in one vectorised call (instead of str() + strptime
# on every row). The format is unchanged, so a malformed date string still
# raises; missing values become NaT and fall outside both windows below.
databig['date'] = pd.to_datetime(databig['date'], format='%Y-%m-%d %H:%M:%S')

# Window boundaries: the split point is 18:00 on election day, the windows
# reach one calendar month back and one calendar month forward.
WINDOW_START = pd.Timestamp('2019-04-26 18:00:00')
ELECTION_SPLIT = pd.Timestamp('2019-05-26 18:00:00')
WINDOW_END = pd.Timestamp('2019-06-26 18:00:00')

# subset to the month before the election: [WINDOW_START, ELECTION_SPLIT)
filter_pre = (databig['date'] >= WINDOW_START) & (databig['date'] < ELECTION_SPLIT)
# .assign returns a new frame, so the label is never written into a slice
# of `databig`
data_pre = databig.loc[filter_pre, :].reset_index(drop=True).assign(time='pre')

# subset to the month after the election: [ELECTION_SPLIT, WINDOW_END)
filter_post = (databig['date'] >= ELECTION_SPLIT) & (databig['date'] < WINDOW_END)
data_post = databig.loc[filter_post, :].reset_index(drop=True).assign(time='post')

In [0]:
# combine pre and post election data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives every row a unique label; without it the labels
# 0..n-1 of both parts would appear twice in the combined frame.
data = pd.concat([data_pre, data_post], ignore_index=True)
data

In [0]:
# Lower-case the tweets and delete punctuation, symbols and digits.
#
# The previous pattern concatenated two character classes and the literal
# text "1234567890/", so it only matched two punctuation marks directly
# followed by that literal -- i.e. practically never. The single character
# class below is the union of the characters that pattern listed.
# '@' is deliberately kept: the cleaning step further down needs it to
# recognise (and remove) @usernames.
SPECIAL_CHARS = re.compile(r'[.",()!?;:#$+&/0-9]')
data['tweet'] = data['tweet'].apply(lambda x: SPECIAL_CHARS.sub('', x.lower()))
# collapse every run of blanks (not just exact pairs) into a single blank
data['tweet'] = data['tweet'].apply(lambda x: re.sub(' {2,}', ' ', x))
data['tweet']

In [0]:
# Some summary statistics:
# 1. words per tweet
# Count whitespace-separated tokens. Counting blanks and adding one
# over-counts whenever a tweet has leading, trailing or repeated blanks,
# and reports 1 word for an empty tweet.
data['word_count'] = data['tweet'].str.split().str.len()
data.groupby('party')['word_count'].mean()

In [0]:
# 2. average number of words per tweet for each politician
#    (grouped by the 'real' column)
data.groupby('real')['word_count'].mean()

In [0]:
# 3. number of posts per politician (rows per distinct value of 'real'),
#    sorted from most to fewest
data['real'].value_counts()

In [0]:
# next we delete all stopwords
# this is a standard procedure to only
# use the words that carry a lot of meaning
# in the Doc2Vec model
# (shown here: the first 50 entries of NLTK's German stopword list)
stopwords.words('german')[:50]

In [0]:
# function to remove stopwords, @username and urls
def cleaning(data):
  """Add a cleaned version of every tweet as column 'newtweet'.

  Cleaning steps, applied to the text in column 'tweet':
    1. drop German and English stopwords (whitespace-separated tokens),
    2. drop @usernames,
    3. drop URLs (anything starting with http or www.).

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain a string column 'tweet'.

  Returns
  -------
  pandas.DataFrame
      The SAME object that was passed in, with the extra column 'newtweet'.
      The frame is modified in place on purpose: later cells of this
      notebook read data['newtweet'] from the original frame.
  """
  print("start cleaning vector")

  df = data  # alias, not a copy -- see "Returns" above

  # remove stopwords from data
  # The second positional argument of stopwords.words() is not a second
  # language, so the two lists have to be fetched separately and merged.
  stop = set(stopwords.words('german')) | set(stopwords.words('english'))
  df['newtweet'] = df['tweet'].str.split()
  df['newtweet'] = df['newtweet'].apply(
      lambda tokens: [token for token in tokens if token not in stop])
  df['newtweet'] = df['newtweet'].str.join(" ")

  # remove @usernames (handles may contain letters, digits and underscores)
  df['newtweet'] = df['newtweet'].apply(lambda x: re.sub(r'@[A-Za-z0-9_]+', '', x))

  # remove urls
  # regex=True is required: newer pandas versions treat the pattern as a
  # literal string by default. The dot after "www" is escaped so that it
  # only matches a real dot.
  df['newtweet'] = df['newtweet'].str.replace(
      r'http\S+|www\.\S+', '', case=False, regex=True)
  print("finished cleaning vector")
  return(df)

In [0]:
# clean data
# NOTE: cleaning() adds the column 'newtweet' to `data` itself and returns
# that same object, so `df` and `data` are two names for one DataFrame.
df = cleaning(data)
df['newtweet']

In [0]:
# save the cleaned tweet table (including its row index) as
# 'data_cleaned.csv' in the data directory
df.to_csv((path + 'data_cleaned.csv'), sep=',', encoding='utf-8', index=True)

**Training the Doc2Vec model**



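Train our Doc2Vec model. Doc2Vec uses a neural network to represent each document as a numerical vector, such that similar documents receive similar vectors. In our case a document is the collection of all tweets of one politician from one period (before or after the election). These vectors are exactly what we need as input for our SOM.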




In [0]:
# define function to train Doc2Vec model
def trainmodel(cleanedtweets, max_epochs, vec_size, alpha , modelname):
  """Train a Doc2Vec model (distributed memory, dm=1) and save it to disk.

  Parameters
  ----------
  cleanedtweets : iterable of str
      One text per document. Document i is tagged with the string str(i).
  max_epochs : int
      Number of passes over the corpus.
  vec_size : int
      Dimensionality of the document vectors.
  alpha : float
      Initial learning rate.
  modelname : str
      File name the trained model is saved under.

  Returns
  -------
  None
      The model is written to `modelname`; load it with Doc2Vec.load().

  Notes
  -----
  Training is a single train() call and gensim handles the learning-rate
  decay from `alpha` down to `min_alpha`. The previous manual loop
  subtracted 0.0002 from alpha after every iteration without a lower bound;
  starting from alpha = 0.0025 the learning rate became negative after 13
  iterations, so most of the 100 iterations worked against the training.
  That loop also ran the model's default number of epochs inside every
  iteration, so the real number of passes was a multiple of `max_epochs`.
  """
  # tag data to identify semantic structures
  tagged_data = [TaggedDocument(words=word_tokenize(text.lower()),
                                tags=[str(i)])
                 for i, text in enumerate(cleanedtweets)]

  model = Doc2Vec(vector_size=vec_size,  # `size` is the deprecated spelling
                  alpha=alpha,
                  # never let the final rate exceed the initial one
                  min_alpha=min(0.00025, alpha),
                  min_count=1,
                  dm=1,
                  epochs=max_epochs)

  model.build_vocab(tagged_data)

  model.train(tagged_data,
              total_examples=model.corpus_count,
              epochs=model.epochs)

  model.save(str(modelname))
  print("Model" + str(modelname) +"Saved")

In [0]:
# Build one document per (period, politician): all cleaned tweets of that
# politician from that period, joined with blanks. A politician without
# tweets in a period still gets a row, with an empty string as document.
col = []
names = pd.unique(data['real'])
time = pd.unique(data['time'])
for y in time:
  subset_time = data[data['time'] == y]
  for name in names:
    subset_name = subset_time[subset_time['real'] == name]
    # do not call this variable `all`: in a notebook that would replace the
    # built-in all() for every cell executed afterwards
    joined_tweets = subset_name['newtweet'].str.cat(sep=' ')
    col.append({'name': name, 'tweets': joined_tweets, 'time': y})

aggdata = pd.DataFrame(col)
aggdata

In [0]:
# Train the model. Each training document is one row of `aggdata`, i.e. the
# joined tweets of one politician from one period (pre or post election).
# The trained model is saved under the file name "aggmodel".
aggmydata = aggdata['tweets']
trainmodel(cleanedtweets = aggmydata, max_epochs = 100, vec_size = 200, alpha = 0.0025, modelname = "aggmodel")

In [0]:
# load the trained model back from the file written by trainmodel
modelagg = Doc2Vec.load("aggmodel")

In [0]:
# Some summaries to get an impression of what our model looks like
# Print the document vector stored under tag '3'. Tags are the stringified
# positions assigned in trainmodel, so this is the vector of the 4th
# aggregated document (the 4th row of aggdata), not of a single tweet.
print(modelagg.docvecs['3'])


In [0]:
# number of words in our vocabulary
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; newer gensim
# releases are understood to expose `wv.key_to_index` instead -- confirm
# against the installed version.
len(modelagg.wv.vocab)

In [0]:
# number of trained document tags
# (one tag was created per training document, so this is expected to equal
# the number of rows of aggdata)
len(modelagg.docvecs)

In [0]:
# function to assemble our model back into a dataframe
# that we can use as input for our SOM
def assemble(data, model):
  """Attach the Doc2Vec document vectors to `data` as columns x1 ... xN.

  Row i of `data` (by position) receives the vector model.docvecs[i], so
  `data` must be in the same order as the documents the model was trained
  on.

  Parameters
  ----------
  data : pandas.DataFrame
      One row per trained document. Any index is accepted.
  model : object
      Trained model whose `docvecs[i]` returns the vector of document i.

  Returns
  -------
  pandas.DataFrame
      A new frame with the columns of `data` followed by 'x1', ..., 'xN'
      (N = vector size) and the same index as `data`. `data` itself is not
      modified. For an empty `data` a plain copy is returned.
  """
  n_docs = len(data)
  if n_docs == 0:
    # nothing to attach; the vector size cannot be read from a document
    return data.copy()

  # one row per document, one column per vector component
  matrix = np.vstack([np.asarray(model.docvecs[pos]) for pos in range(n_docs)])

  # column labels are derived from the actual vector size, so changing the
  # vector size of the Doc2Vec model needs no change here
  indexvec = ["x" + str(k + 1) for k in range(matrix.shape[1])]

  # Re-use the index of `data`: concat aligns on index labels, so a frame
  # whose index is not 0..n-1 would otherwise produce shifted or NaN rows.
  wordvector = pd.DataFrame(matrix, index=data.index, columns=indexvec)
  joint_data = pd.concat([data, wordvector], axis=1, sort=False)
  return(joint_data)


In [0]:
# Combine the aggregated tweet table with the document vectors of the
# trained model and display the result to check whether it looks fine.
personb = assemble(aggdata, modelagg)
personb

In [0]:
# Store the table with the document vectors in the data directory ...
WORDVECTOR_FILE = path + 'data_wordvectors_pers.csv'
personb.to_csv(WORDVECTOR_FILE, sep=',', encoding='utf-8', index=True)

# ... and write a second copy for the PCA into the current working
# directory, which is then offered as a browser download (Colab only).
from google.colab import files
PCA_FILE = 'forPCA.csv'
personb.to_csv(PCA_FILE)
files.download(PCA_FILE)