# Data Import

[Word2Vec](https://pathmind.com/wiki/word2vec)

# Load in datasets

In [1]:
import pandas as pd
import re
import string

# One scraped CSV per state; the order here fixes the order of the per-state frames.
state_csvs = [
    '../data/scrape_dm_cali.csv',
    '../data/scrape_dm_mich.csv',
    '../data/scrape_dm_ny.csv',
    '../data/scrape_dm_ohio.csv',
    '../data/scrape_dm_texas.csv',
]
cali, mich, ny, ohio, texas = map(pd.read_csv, state_csvs)

# Stack the five frames; each frame keeps its own row labels, so labels repeat.
tweets = pd.concat([cali, mich, ny, ohio, texas])

tweets.head()

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start
0,710245730590404608,TTWN SF Bay Area,Power outage in Cupertino #BayArea #Traffic ht...,2016-03-16 23:25:52,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
1,708811502241734656,San Jose Now,WEATHER ALERT: Flash flood watch in Bay Area a...,2016-03-13 00:26:45,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
2,706856719733776384,San Jose Now,Power outages:30 in San Francisco154 on Penins...,2016-03-07 14:59:09,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
3,726876023573204993,San Jose Now,Power outage in Fremont. Several intersections...,2016-05-01 20:48:43,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
4,724681945095888897,San Jose Now,"East Bay power outages also affects BART, UC B...",2016-04-25 19:30:14,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01


## Data Cleaning

In [2]:
tweets.shape

(20100, 10)

In [3]:
# Keep only the first occurrence of each fully identical row.
tweets = tweets.drop_duplicates()

In [4]:
tweets.shape

(16913, 10)

In [5]:
# Renumber rows 0..n-1; the previous row labels are kept in a new `index` column.
tweets = tweets.reset_index()

In [6]:
tweets.isnull().sum()

index          0
tweet_id       0
username       0
text           0
tweet_date     0
search_term    0
city           0
lat            0
long           0
radius         0
query_start    0
dtype: int64

## Function to clean tweets & usernames

In [8]:
def clean_str(string):
    """Normalize a tweet or username into lowercase, punctuation-free text.

    Steps, in order: lowercase, replace URL-like tokens with a space, turn
    line breaks into spaces, delete digits, delete every character that is
    neither a word character nor whitespace, then strip the ends.

    Parameters
    ----------
    string : str
        Raw text. The name shadows the stdlib ``string`` module inside this
        function; it is kept so existing callers are unaffected.

    Returns
    -------
    str
        The cleaned text. Interior runs of spaces are not collapsed, so
        tokenize with ``str.split()`` rather than ``split(' ')``.
    """
    string = string.lower()
    # The scheme is optional, so any dotted token ("t.co/abc", "p.m") is
    # treated as a URL and blanked out.
    # NOTE(review): "&amp;" inside the character classes looks like an
    # HTML-escaping artifact (it effectively adds ';' to the class); the
    # pattern is left unchanged here.
    url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
    string = re.sub(url_pattern, ' ', string)
    # Line breaks become a space rather than nothing; deleting them outright
    # would glue the last word of one line to the first word of the next.
    string = re.sub(r"[\r\n]+", " ", string)
    string = re.sub(r"[0-9]+", "", string)
    string = re.sub(r'[^\w\s]', '', string)

    return string.strip()

In [9]:
# Clean the tweet body and the author name with the same normalizer.
for column in ('text', 'username'):
    tweets[column] = tweets[column].map(clean_str)

In [28]:
# Combine the cleaned username and tweet text into one space-separated string per row.
tweets = tweets.assign(name_and_tweet=tweets['username'] + " " + tweets['text'])

## Word2Vec: tokenize tweets and train embeddings

In [12]:
# Import Word2Vec
from gensim.models.word2vec import Word2Vec


In [13]:
# turning cleaned tweets into list of lists

def tweet_to_words(tweets, stops=None):
    """Split each cleaned tweet into tokens and drop stop words.

    Parameters
    ----------
    tweets : iterable of str
        Cleaned tweet strings; each one is split on whitespace.
    stops : iterable of str, optional
        Words to remove from every tweet. When omitted, a built-in list of
        place names (cities/states used in the scrape) plus a handful of
        other tokens is used.

    Returns
    -------
    list of list of str
        One token list per input tweet, in input order. A tweet made up
        entirely of stop words yields an empty list.
    """
    if stops is None:
        stops = ['los', 'angeles', 'san', 'diego', 'jose', 'columbus', 'cleveland',
                 # 'cincinatti' is the original (misspelled) entry, kept for
                 # compatibility; 'cincinnati' is the spelling that actually
                 # needs filtering.
                 'cincinatti', 'cincinnati', 'detroit', 'ann', 'arbor', 'warren',
                 'new', 'york', 'ny', 'buffalo', 'rochester', 'michigan',
                 'california', 'ohio', 'texas', 'st', 'amc', 'scott', 'schudlich',
                 'finnished', 'de', 'antonio', 'la', 'houston', 'dallas', 'santa',
                 'ana', 'clara']

    # Build the lookup once, outside the per-tweet work; set membership is O(1).
    stop_set = frozenset(stops)

    return [[word for word in tweet.split() if word not in stop_set]
            for tweet in tweets]

In [14]:
# making corpus
corpus = tweet_to_words(tweets['name_and_tweet'])

In [15]:

# Fit word embeddings on the tokenized tweets.
model = Word2Vec(
    corpus,       # list of token lists, one per tweet
    size=100,     # dimensionality of each word vector
    window=5,     # context window width
    min_count=1,  # keep every word, including those that occur once
    sg=1,         # 1 = skip-gram, 0 = CBOW (library default)
    workers=4,    # number of training threads
)

# # Do what you'd like to do with your data!
# model.most_similar("car")

## Testing out words

In [27]:
# Query neighbours through the model's keyed vectors (`model.wv`); calling
# `most_similar` directly on the Word2Vec object is deprecated in gensim 3.x
# (it emits a warning) and is no longer available in gensim 4.
model.wv.most_similar('poweroutage')

  """Entry point for launching an IPython kernel.


[('ladwp', 0.804068922996521),
 ('park', 0.7910938262939453),
 ('grand', 0.7867336273193359),
 ('outage', 0.7846530675888062),
 ('mesa', 0.7829866409301758),
 ('area', 0.7741690874099731),
 ('kearny', 0.7728931903839111),
 ('news', 0.7725344896316528),
 ('ttwn', 0.7663310766220093),
 ('near', 0.7647380828857422)]

In [24]:
# Query neighbours through the model's keyed vectors (`model.wv`); calling
# `most_similar` directly on the Word2Vec object is deprecated in gensim 3.x
# (it emits a warning) and is no longer available in gensim 4.
model.wv.most_similar('blackout')

  """Entry point for launching an IPython kernel.


[('dukeenergy', 0.979924201965332),
 ('large', 0.9769753217697144),
 ('tornado', 0.9761412143707275),
 ('flooding', 0.9750644564628601),
 ('sce', 0.9732176065444946),
 ('kfmb', 0.9729523658752441),
 ('ice', 0.9723016023635864),
 ('brief', 0.9722714424133301),
 ('partial', 0.9714536070823669),
 ('se', 0.9696272611618042)]

## Code to check for words in tweets

In [29]:
# Print every combined username + tweet string that contains the probe substring.
for hit in (entry for entry in tweets['name_and_tweet'] if 'kfmb' in entry):
    print(hit)

times of san diego power outage in kearny mesa keeping cbs kfmb news off air as of
cbs news power outage in kearny mesa leaves kfmb studios in the dark
news  san diego power outage in kearny mesa leaves kfmb studios in the dark
carlo cecchetto we got hit with a power outage at kfmb were trying to get everything working well try to get news  at  on the air but its not likely well have a full newscast and were hoping to be on for news  at  poweroutage sdge
times of san diego power outage in kearny mesa keeping cbs kfmb news off air as of
carlo cecchetto we got hit with a power outage at kfmb were trying to get everything working well try to get news  at  on the air but its not likely well have a full newscast and were hoping to be on for news  at  poweroutage sdge
carlo cecchetto we got hit with a power outage at kfmb were trying to get everything working well try to get news  at  on the air but its not likely well have a full newscast and were hoping to be on for news  at  poweroutage s