# Data Import

[Word2Vec](https://pathmind.com/wiki/word2vec)

# Load in datasets

In [30]:
import pandas as pd
import re
import string

cali = pd.read_csv('../data/scrape_dm_cali.csv')

mich = pd.read_csv('../data/scrape_dm_mich.csv')

ny = pd.read_csv('../data/scrape_dm_ny.csv')

ohio = pd.read_csv('../data/scrape_dm_ohio.csv')

texas = pd.read_csv('../data/scrape_dm_texas.csv')

tweets = pd.concat([cali, mich, ny, ohio, texas])

tweets.head()

Unnamed: 0,tweet_id,username,text,tweet_date,search_term,city,lat,long,radius,query_start
0,710245730590404608,TTWN SF Bay Area,Power outage in Cupertino #BayArea #Traffic ht...,2016-03-16 23:25:52,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
1,708811502241734656,San Jose Now,WEATHER ALERT: Flash flood watch in Bay Area a...,2016-03-13 00:26:45,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
2,706856719733776384,San Jose Now,Power outages:30 in San Francisco154 on Penins...,2016-03-07 14:59:09,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
3,726876023573204993,San Jose Now,Power outage in Fremont. Several intersections...,2016-05-01 20:48:43,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01
4,724681945095888897,San Jose Now,"East Bay power outages also affects BART, UC B...",2016-04-25 19:30:14,power outage,San Jose,37.3323,-121.853394,10mi,2016-01-01


## Data Cleaning

In [31]:
tweets.shape

(20100, 10)

In [32]:
tweets.drop_duplicates(inplace=True)

In [33]:
tweets.shape

(16913, 10)

In [34]:
tweets.reset_index(inplace= True)

In [35]:
tweets.isnull().sum()

index          0
tweet_id       0
username       0
text           0
tweet_date     0
search_term    0
city           0
lat            0
long           0
radius         0
query_start    0
dtype: int64

## Function to clean tweets & usernames

In [36]:
def clean_str(string):
    string = string.lower()
    url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
    string = re.sub(url_pattern, ' ', string)
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]+", "", string)
    string = re.sub(r'[^\w\s]','', string)    
    
    return string.strip()

In [37]:
tweets['text'] = tweets['text'].map(clean_str)
tweets['username'] = tweets['username'].map(clean_str)

In [38]:
# making new column with username + tweet
tweets['name_and_tweet'] = tweets['username'] + " " +tweets['text']

## Word2Vec Stuff

In [39]:
# Import Word2Vec
from gensim.models.word2vec import Word2Vec


In [78]:
# turning cleaned tweets into list of lists

def tweet_to_words(tweets):
    # empty list of tweets
    list_of_tweets = []
    
    # make tweet into list of words
    for tweet in tweets:
        tweet = tweet.split()
    
        # list of stop words
        stops = ['los','angeles','san','diego','jose','columbus','cleveland',
             'cincinatti','detroit','ann','arbor','warren','new','york',
             'ny','buffalo','rochester','michigan','california','ohio','texas',
            'st','amc','scott','schudlich','finnished','de','antonio','la','houston',
                 'dallas', 'santa', 'ana', 'clara', 'grand', 'rapids', 'kearny','mesa',
                'peticolas','christmas']
    
        # Remove stopwords.
        meaningful_tweet = [w for w in tweet if not w in stops]
    
        # add tweet to big list
        list_of_tweets.append(meaningful_tweet)

    return list_of_tweets

In [79]:
# making corpus
corpus = tweet_to_words(tweets['name_and_tweet'])

In [80]:

# # Train a model! 
model = Word2Vec(corpus,      # Corpus of data.
                  size=100,    # How many dimensions do you want in your word vector?
                  window=5,    # How many "context words" do you want?
                  min_count=1, # Ignores words below this threshold.
                  sg=1,        # SG = 1 uses SkipGram, SG = 0 uses CBOW (default).
                  workers=4)   # Number of "worker threads" to use (parallelizes process).

# # Do what you'd like to do with your data!
# model.most_similar("car")

## Testing out words

In [99]:
model.most_similar('dte')

  """Entry point for launching an IPython kernel.


[('aep', 0.9570109844207764),
 ('currently', 0.9534438848495483),
 ('reported', 0.9530239105224609),
 ('update', 0.9519146680831909),
 ('widespread', 0.950791597366333),
 ('wind', 0.9502784013748169),
 ('planned', 0.9488312602043152),
 ('experiencing', 0.9479764699935913),
 ('affecting', 0.9418179392814636),
 ('news', 0.9389858841896057)]

In [100]:
model.most_similar('outages')

  """Entry point for launching an IPython kernel.


[('customers', 0.8877463340759277),
 ('crews', 0.8768340349197388),
 ('without', 0.8660569190979004),
 ('lines', 0.8516985774040222),
 ('affecting', 0.8510775566101074),
 ('experiencing', 0.8509665727615356),
 ('restoration', 0.8467317223548889),
 ('county', 0.8454038500785828),
 ('reports', 0.8447811603546143),
 ('area', 0.8438729047775269)]

In [90]:
model.most_similar('lights')

  """Entry point for launching an IPython kernel.


[('light', 0.6927928924560547),
 ('music', 0.6893447637557983),
 ('hanging', 0.683314323425293),
 ('totally', 0.6826909780502319),
 ('tonight', 0.6807053089141846),
 ('checking', 0.6710796356201172),
 ('bar', 0.6680662631988525),
 ('georgia', 0.6608129143714905),
 ('friday', 0.6594502329826355),
 ('playing', 0.6566678285598755)]

## Code to check for words in tweets

In [93]:
for tweet in tweets['name_and_tweet']:
    if 'powerout' in tweet:
        print(tweet)

lil jinni poweroutage at sanjose fryselectronics if only they sold generators  frys electronics
kristy wilce i was already home before this went out i love being nonessential poweroutage noworkforme
san jose now tech security ceo ajay k arora on multiple poweroutage there is no such thing as coincidence when it comes to
san jose now poweroutage triggers traffic nightmare getting into and around san francisco
san jose now update bart station at montgomery st has reopened but gridlock continues amid sf poweroutage
san jose now number of pge customers affected by san francisco poweroutage rises to
san jose now pge now says  people are without power in san francisco poweroutage
assetburned shopgreatmall bad information management about poweroutage security just patrolling but not informing anyone
spiffy and smurfy sanjosepd pgeme cityofsanjose    ringwood and mckay just lost power poweroutage
trishas trying to be nice today nycpoweroutage lets all pray this is just a blackout oopsthe last 