In [3]:
### General definitions -----------------------------------------------------
### imports 
import csv
import os
import string
import pandas
import re
import numpy


### displays files in directory
print("Files in Directory")
listOfFileNames = os.listdir(os.getcwd()) #finds file names in directory
print(listOfFileNames)

## Word vectors from https://nlp.stanford.edu/projects/glove/
_50d_wordvec = "glove.6B.50d.txt"
_100d_wordvec = "glove.6B.100d.txt"
_300d_wordvec = "glove.6B.300d.txt"

_twitter_50d_wordvec = "glove.twitter.27B.50d.txt"

DIMENSIONALITY = 50

## Twitter files 
### all russian bot tweets
bigRussiaFile = "ira_tweets_csv_hashed.csv"    # all russian bot tweets
smallRussiaFile = "IRAhandle_tweets_1.csv" #smaller sample

# normal tweets
normalTweets = "dashboard_x_usa_x_filter_nativeretweets.csv" # normal tweets



Files in Directory
['IRAhandle_tweets_4.csv', 'IRAhandle_tweets_5.csv', 'IRAhandle_tweets_7.csv', 'glove.twitter.27B.100d.txt', 'IRAhandle_tweets_6.csv', 'IRAhandle_tweets_2.csv', '.DS_Store', 'IRAhandle_tweets_3.csv', 'IRAhandle_tweets_1.csv', 'RussianTrollData_Preprocessing.ipynb', 'glove.twitter.27B.50d.txt', 'bigboy.csv', 'nonRTs copy 2.csv', 'CSE575_project.ipynb', 'README.md', 'glove.twitter.27B.25d.txt', 'IRAhandle_tweets_12.csv', 'IRAhandle_tweets_13.csv', 'kowalski_analysis.py', 'IRAhandle_tweets_11.csv', 'glove.twitter.27B.200d.txt', 'IRAhandle_tweets_10.csv', 'russiantrolldata_preprocessing.py', '.ipynb_checkpoints', '.git', 'RussianTrollData_Preprocessing-Copy1.ipynb', 'dashboard_x_usa_x_filter_nativeretweets.csv', 'IRAhandle_tweets_8.csv', 'nonRTs.csv', 'ira_tweets_csv_hashed.csv', 'tweetData.csv', 'IRAhandle_tweets_9.csv']


In [4]:
## Creates pandas dataframe objects -----------------------------------------------------

print("Russia Tweets Info \"rtd\"")
rtd = pandas.read_csv(smallRussiaFile) 
print(type(rtd))                     
rtd_TotalRows, rtd_TotalColumns = rtd.shape
print(rtd.columns)

print("\nNormal Tweet Info \"ntd\"")
ntd = pandas.read_csv(normalTweets)
print(type(ntd))                      
ntd_TotalRows, ntd_TotalColumns = ntd.shape
print(ntd.columns)

#rtd.head(3)                                 # top 3 rows
#rtd['publish_data']                         # gives pandas.series of that column
#rtd.publish_data                            # same thing
#rtd['author','content','tempurature']       # mutiple
#rtd['external_author_id'].max()
#rtd.describe()                              #describes data
#rtd[rtd.author=="10_GOP"]                   # gives rows with author = 10_GOP

Russia Tweets Info "rtd"
<class 'pandas.core.frame.DataFrame'>
Index(['external_author_id', 'author', 'content', 'region', 'language',
       'publish_date', 'harvested_date', 'following', 'followers', 'updates',
       'post_type', 'account_type', 'retweet', 'account_category',
       'new_june_2018', 'alt_external_id', 'tweet_id', 'article_url',
       'tco1_step1', 'tco2_step1', 'tco3_step1'],
      dtype='object')

Normal Tweet Info "ntd"
<class 'pandas.core.frame.DataFrame'>
Index(['Tweet Id', 'Date', 'Hour', 'User Name', 'Nickname', 'Bio',
       'Tweet content', 'Favs', 'RTs', 'Latitude', 'Longitude', 'Country',
       'Place (as appears on Bio)', 'Profile picture', 'Followers',
       'Following', 'Listed', 'Tweet language (ISO 639-1)', 'Tweet Url'],
      dtype='object')


In [5]:
### Preprocessing Data ----------------------------------------------------------------------

## we only want English tweets
rtd = rtd[rtd['language']=='English']
ntd = ntd[ntd["Tweet language (ISO 639-1)"]=='en']

# we only want text data
rtd_tweettexts = rtd['content']
ntd_tweettexts = ntd['Tweet content']



In [6]:
## tokenize tweet function defintion ------------------------------------------------
import re
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=True):  # Tweet tokenizer
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
print(type(preprocess(tweet)))

['rt', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#nlp']
<class 'list'>


In [29]:
### Word Vectors -----------------------------------------------------------------------------
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

glove_wordmap = {}  # given word from 30d data, returns vector from 30D data
lmtzr = WordNetLemmatizer()
for line in open(_twitter_50d_wordvec):           
    word = line.split(' ')[0]
    vec = line.split(' ')[1:]
    glove_wordmap[word] = numpy.asarray(vec, dtype='float32')

# given word, return glove wordvec
def wordLookup(word):
    vec = glove_wordmap.get(word)
    if vec is not None:
      return vec   # returns ndarray 50x1, dtype=float32)
    else:
      return None

def tokenListVector(tokenList):
    vector_dim = len(wordLookup("test"))
    vec = numpy.zeros(vector_dim)
    
    for token in tokenList:
        wordVec = wordLookup(token)
        if wordVec is not None:
            vec = vec + wordVec
            
    return vec


[nltk_data] Downloading package wordnet to /Users/danz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
rtd_tweettexts_df = pandas.DataFrame(rtd_tweettexts)
ntd_tweettexts_df = pandas.DataFrame(ntd_tweettexts)

rtd_tweettexts_df['Normal'] = 0
rtd_tweettexts_df['Bot'] = 1
rtd_tweettexts_df['Type'] = 'Bot'

ntd_tweettexts_df['Normal'] = 1
ntd_tweettexts_df['Bot'] = 0
ntd_tweettexts_df['Type'] = 'Normal'

ntd_tweettexts_df.columns = ['content', 'Normal','Bot','Type']
print(ntd_tweettexts_df.columns)
print(rtd_tweettexts_df.columns)

all_tweettexts_df = ntd_tweettexts_df.append(rtd_tweettexts_df)
all_tweettexts_df.tail

Index(['content', 'Normal', 'Bot', 'Type'], dtype='object')
Index(['content', 'Normal', 'Bot', 'Type'], dtype='object')


<bound method NDFrame.tail of                                                   content  Normal  Bot    Type
0       Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...       1    0  Normal
2       Good. Morning. #morning #Saturday #diner #VT #...       1    0  Normal
3       @gratefuldead recordstoredayus 🌹🌹🌹 @ TOMS MUSI...       1    0  Normal
4       Egg in a muffin!!! (@ Rocket Baby Bakery - @ro...       1    0  Normal
5       @lyricwaters should've gave the neighbor  a bu...       1    0  Normal
6       On the way to CT! (@ Mamaroneck, NY in Mamaron...       1    0  Normal
7       We're #hiring! Read about our latest #job open...       1    0  Normal
8       Me... @ Montgomery Scrap Corporation https://t...       1    0  Normal
9       BAYADA Home Health Care: Home Health Registere...       1    0  Normal
10      Shift Supervisor Trainee - CVS Health: (#OCEAN...       1    0  Normal
12      Although I am not endorsing a candidate, I tho...       1    0  Normal
13      I think spring

In [181]:
all_tweettexts_df = pandas.read_csv('tweetData.csv')
all_tweettexts_df = all_tweettexts_df.sample(frac=1)


In [182]:
## create word Vector dataset

def tweet2vec(tweettext):
    tokens = preprocess(tweettext)
    vec = tokenListVector(tokens)
    return vec

for x in range(0,30):
    all_tweettexts_df['d'+str(x)] = 0

vecMatrix = numpy.zeros((len(all_tweettexts_df),50+1))
for i,row in all_tweettexts_df.iterrows():
    if row['Type'] == "Bot":
        vecMatrix[i][0] = int(1)
    else:
        vecMatrix[i][0] = int(0)
    vec = tweet2vec(row['content'])
    j = 1
    for x in vec:
        vecMatrix[i][j] = x
        j+=1

#all_tweettexts_df['vector'] = all_tweettexts_df['content'].apply(lambda x: tweet2vec(x))



In [188]:

## Preprepare data 60:20:20
test_size = .2
sample_num = numpy.size(vecMatrix,0)
numpy.random.shuffle(vecMatrix)

train_x = vecMatrix[:int(test_size*sample_num),1:]
train_y = vecMatrix[:int(test_size*sample_num),0]

test_x = vecMatrix[int(test_size*sample_num)+1:,1:]
test_y = vecMatrix[int(test_size*sample_num)+1:,0]


from sklearn import preprocessing
from sklearn import utils

lab_enc = preprocessing.LabelEncoder()
train_y = lab_enc.fit_transform(train_y)
test_y = lab_enc.fit_transform(test_y)

In [191]:
### SVM

from sklearn import svm, metrics

clf = svm.SVC(kernel='linear',gamma='auto')
clf.fit(train_x, train_y) 

pred_y = clf.predict(test_x)

print("Accuracy:",metrics.accuracy_score(test_y, pred_y))





Accuracy: 0.893349565121428


array([0, 0, 1, ..., 0, 1, 0])

array([0, 0, 0, ..., 0, 0, 0])

Index(['content', 'Normal', 'Bot', 'Type', 'vector'], dtype='object')