In [1]:
import pandas as pd
import re
import textblob
from gensim import corpora
from gensim import similarities
import time

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Widen column display so full tweet texts and URLs are not truncated in rendered frames.
pd.options.display.max_colwidth = 500

In [3]:
# Location of the raw tweet dump, relative to this notebook.
RAW_TWEETS_CSV = '../data/raw/all.csv'
tweets = pd.read_csv(RAW_TWEETS_CSV)

In [4]:
# Peek at the first two rows to check the columns that were loaded.
tweets.head(2)

Unnamed: 0,tweet_id,tweet_text,user_id,tweet_time,no_favs,no_retweets,urls
0,851485433003421701,#Trump’s border wall would be an ecological and financial disaster for the U.S. https://t.co/lGSc8X53Vg,613521891,2017-04-10T17:22:01.000Z,0,0,https://www.vox.com/energy-and-environment/2017/4/10/14471304/trump-border-wall-animals
1,851514117374398464,#Trump’s border wall would be an ecological and financial disaster for the U.S. https://t.co/v1TW467479,222481411,2017-04-10T19:16:00.000Z,0,0,https://www.vox.com/energy-and-environment/2017/4/10/14471304/trump-border-wall-animals


#### Identify URLs

In [5]:
# URL matcher: scheme, a host made of two or more dot-separated labels, then an
# optional path. Inside the path, '.' and '?' are only accepted when a word
# character follows, so sentence punctuation right after a link is not swallowed.
# An optional "www." prefix has to be a group, not a character class such as
# [www\.], which matches a single 'w' or '.' character.
URL_PATTERN = r'https?://[\w-]+(?:\.[\w-]+)+(?:/(?:[\w/$%&=#~+-]|[.?](?=\w))*)?'

sample_tweet = 'https://t.co/lGSc8X53Vg #Trump’s border wall https://t.co/lGSc8X53Vg would be an ecological and financial disaster for the U.S. https://t.co/lGSc8X53Vg'
re.findall(URL_PATTERN, sample_tweet)

['https://t.co/lGSc8X53Vg',
 'https://t.co/lGSc8X53Vg',
 'https://t.co/lGSc8X53Vg']

#### Replace with URL tag

In [6]:
# URL matcher: scheme, a host made of two or more dot-separated labels, then an
# optional path. Inside the path, '.' and '?' are only accepted when a word
# character follows, so punctuation after a link stays in the text.
URL_PATTERN = r'https?://[\w-]+(?:\.[\w-]+)+(?:/(?:[\w/$%&=#~+-]|[.?](?=\w))*)?'

re.sub(URL_PATTERN, '<URL>', 'https://t.co/lGSc8X53Vg #Trump’s border wall https://t.co/lGSc8X53Vg would be an ecological and financial disaster for the U.S. https://t.co/lGSc8X53Vg')

'<URL> #Trump’s border wall <URL> would be an ecological and financial disaster for the U.S. <URL>'

In [7]:
# URL matcher: scheme, a host made of two or more dot-separated labels, then an
# optional path. Inside the path, '.' and '?' are only accepted when a word
# character follows, so punctuation after a link stays in the text.
# The pattern has no capturing groups, so re.findall returns whole URLs.
URL_PATTERN = r'https?://[\w-]+(?:\.[\w-]+)+(?:/(?:[\w/$%&=#~+-]|[.?](?=\w))*)?'

# Tweet text with every link collapsed to a single placeholder token.
tweets['tweet_text_url_token'] = \
tweets['tweet_text'].apply(lambda text: re.sub(URL_PATTERN, '<URL>', text))

# The links themselves, kept as a list per tweet.
tweets['tweet_urls'] = \
tweets['tweet_text'].apply(lambda text: re.findall(URL_PATTERN, text))

In [9]:
# Work on the first five tweets while prototyping the pipeline.
docs = tweets.head(5)

In [11]:
# Show the sample, including the derived tweet_text_url_token and tweet_urls columns.
docs

Unnamed: 0,tweet_id,tweet_text,user_id,tweet_time,no_favs,no_retweets,urls,tweet_text_url_token,tweet_urls
0,851485433003421701,#Trump’s border wall would be an ecological and financial disaster for the U.S. https://t.co/lGSc8X53Vg,613521891,2017-04-10T17:22:01.000Z,0,0,https://www.vox.com/energy-and-environment/2017/4/10/14471304/trump-border-wall-animals,#Trump’s border wall would be an ecological and financial disaster for the U.S. <URL>,[https://t.co/lGSc8X53Vg]
1,851514117374398464,#Trump’s border wall would be an ecological and financial disaster for the U.S. https://t.co/v1TW467479,222481411,2017-04-10T19:16:00.000Z,0,0,https://www.vox.com/energy-and-environment/2017/4/10/14471304/trump-border-wall-animals,#Trump’s border wall would be an ecological and financial disaster for the U.S. <URL>,[https://t.co/v1TW467479]
2,855009400670617600,The research and experts are clear: Tougher border security measures do not actually stop drug trafficking https://t.co/ClfsLDuDCO,2347049341,2017-04-20T10:45:01.000Z,0,0,https://www.vox.com/policy-and-politics/2017/4/19/15326286/trump-wall-opioid-epidemic,The research and experts are clear: Tougher border security measures do not actually stop drug trafficking <URL>,[https://t.co/ClfsLDuDCO]
3,855009683542728704,Hmmmmmm https://t.co/y3fNx7KuJT,243580387,2017-04-20T10:46:08.000Z,0,0,https://www.vox.com/policy-and-politics/2017/4/19/15326286/trump-wall-opioid-epidemic,Hmmmmmm <URL>,[https://t.co/y3fNx7KuJT]
4,855011873183342592,(psst! It was never about drugs. It's always been about brown people. They holler DRUGS! to distract from their horrible racism) https://t.co/cMeFEnipLW,17889654,2017-04-20T10:54:50.000Z,0,0,https://www.vox.com/policy-and-politics/2017/4/19/15326286/trump-wall-opioid-epidemic,(psst! It was never about drugs. It's always been about brown people. They holler DRUGS! to distract from their horrible racism) <URL>,[https://t.co/cMeFEnipLW]


In [12]:
def create_docs(source):
    """Tokenize each tweet into lowercase, whitespace-separated words.

    Parameters
    ----------
    source : pandas.DataFrame
        Must have a ``tweet_id`` column and a ``tweet_text_url_token`` column
        holding strings.

    Returns
    -------
    dict
        Maps every ``tweet_id`` to the list of tokens of its text. When a
        ``tweet_id`` occurs more than once, the last row wins.
    """
    # Walk only the two needed columns; iterrows() would build a full Series
    # object for every row, which is needlessly slow on large frames.
    return {
        tweet_id: text.lower().split()
        for tweet_id, text in zip(source['tweet_id'], source['tweet_text_url_token'])
    }

In [13]:
# Tokenize the five-tweet sample into {tweet_id: [tokens]}.
tweets_dict = create_docs(docs)

In [14]:
# Spot-check the tokens of the first tweet in the sample (looked up by its tweet_id).
tweets_dict[851485433003421701]

['#trump’s',
 'border',
 'wall',
 'would',
 'be',
 'an',
 'ecological',
 'and',
 'financial',
 'disaster',
 'for',
 'the',
 'u.s.',
 '<url>']

In [15]:
# for idx,row in docs.iterrows():
#     print(row['tweet_id'])

In [16]:
def create_dictionary_corpus(source_dict):
    """Build a gensim dictionary and bag-of-words corpus from tokenized documents.

    Parameters
    ----------
    source_dict : dict
        Maps a document key to its list of tokens. Only the values are used;
        the corpus follows the dict's iteration order.

    Returns
    -------
    tuple
        ``(dictionary, corpus)`` where ``dictionary`` is the
        ``corpora.Dictionary`` grown from all documents and ``corpus`` is a
        list with one ``doc2bow`` result per document.
    """
    vocab = corpora.Dictionary()
    bow_corpus = []

    for tokens in source_dict.values():
        # Register the document's tokens first so that doc2bow can map all of them.
        vocab.add_documents([tokens])
        bow_corpus.append(vocab.doc2bow(tokens))

    return vocab, bow_corpus


In [18]:
# Build the token dictionary and the bag-of-words vectors for the tokenized sample.
dictionary, corpus = create_dictionary_corpus(tweets_dict)

In [19]:
# In-memory similarity index over the bag-of-words vectors.
# Passing num_features explicitly spares gensim a scan of the corpus to
# work out the vocabulary size on its own.
index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))

In [20]:
# Query the index with every document at once to get all pairwise similarities.
sims = index[corpus]

In [21]:
# Rows 0 and 1 score ~1.0 against each other: the two tweets differ only in
# their t.co link, and both links became the same <URL> token.
sims

array([[0.99999994, 0.99999994, 0.2592815 , 0.18898223, 0.05455447],
       [0.99999994, 0.99999994, 0.2592815 , 0.18898223, 0.05455447],
       [0.2592815 , 0.2592815 , 0.99999994, 0.17149858, 0.04950738],
       [0.18898223, 0.18898223, 0.17149858, 0.99999994, 0.14433756],
       [0.05455447, 0.05455447, 0.04950738, 0.14433756, 0.99999994]],
      dtype=float32)

### On a bigger dataset

In [22]:
# Total number of tweets loaded from the CSV.
len(tweets)

201873

In [23]:
# Scale the experiment up to the first 2000 tweets (rebinds `docs`).
docs = tweets.head(2000)

In [24]:
# Re-tokenize: tweets_dict now covers the current, larger `docs` sample.
tweets_dict = create_docs(docs)

In [25]:
# Time the dictionary/corpus construction. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
dictionary, corpus = create_dictionary_corpus(tweets_dict)
print("---- %s seconds ----" % (time.perf_counter() - start_time))

---- 0.13380193710327148 seconds ----


In [26]:
# Time the construction of the in-memory similarity index. perf_counter() is a
# monotonic, high-resolution clock meant for measuring intervals.
start_time = time.perf_counter()
# num_features is given explicitly so gensim does not scan the corpus to infer it.
index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))
print("---- %s seconds ----" % (time.perf_counter() - start_time))

---- 0.07704401016235352 seconds ----


In [27]:
# All-pairs query: yields one float32 score per document pair (2000 x 2000 here).
# NOTE(review): a dense result of this kind for all 201,873 tweets would need
# roughly 163 GB, so the full run presumably needs chunked queries or a
# different index - confirm before scaling up.
sims = index[corpus]

In [29]:
# Check that the matrix has one row and one column per sampled tweet.
sims.shape

(2000, 2000)