vector and count the words in tweets.

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
import pandas as pd

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func, inspect
from sqlalchemy import Table, Column, Integer, String, Float, DateTime, MetaData

In [24]:
# Create an engine for the  database

engine = create_engine("sqlite:///../data/data.sqlite", echo=False)
conn = engine.connect()

# Reflect Database into ORM classes
Base = automap_base()
Base.prepare(engine, reflect=True)
meta = MetaData()

tweets = Table(
   'tweets', meta, 
    Column('source',String), 
    Column('text', String), 
    Column('created_at', DateTime),
    Column('retweet_count', Integer),
    Column('favorite_count', Integer),
    Column('id_str', Integer, primary_key = True)
)

no_retweets = Table(
   'no_retweets', meta, 
    Column('source',String), 
    Column('text', String), 
    Column('created_at', DateTime),
    Column('retweet_count', Integer),
    Column('favorite_count', Integer),
    Column('id_str', Integer, primary_key = True)
)


In [25]:
#- select column 'text' from the tweets table - this has no retweets
tweets_text_data = pd.read_sql('SELECT TEXT FROM TWEETS', conn)


In [26]:
tweets = tweets_text_data.text
tweets.head()

0          "@flicka__: @realDonaldTrump for president"
1    The Mar-a-Lago Club was amazing tonight. Every...
2    "@archangeljf12:  ;,@realDonaldTrump for Presi...
3    "@TalentlessCook: @realDonaldTrump You're only...
4    "@yankeejayman: @realDonaldTrump @flicka__ Do ...
Name: text, dtype: object

In [27]:



# To create a Count Vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer, but
# for the most basic example, it is not needed.
vectorizer = CountVectorizer( lowercase=True, token_pattern=r'\b[^\d\W]+\b', stop_words='english')

# For our text, we are going to take some text from our previous blog post
# about count vectorization
# sample_text = ["One of the most basic ways we can numerically represent words "
#                "is through the one-hot encoding method (also sometimes called "
#                "count vectorizing)."]


vector_text = tweets

# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fix
vectorizer.fit(vector_text)

# Now, we can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
print('Vocabulary: ')
print(vectorizer.vocabulary_)

# # If we would like to actually create a vector, we can do so by passing the
# # text into the vectorizer to get back counts
# vector = vectorizer.transform(vector_text)

# # Our final vector:
# print('Full vector: ')
# print(vector.toarray())

# # Or if we wanted to get the vector for one word:
# print('Hot vector: ')
# print(vectorizer.transform(['hot']).toarray())

# # Or if we wanted to get multiple vectors at once to build matrices
# print('Hot and one: ')
# print(vectorizer.transform(['hot', 'one']).toarray())

# # We could also do the whole thing at once with the fit_transform method:
# print('One swoop:')
# new_text = ['Today is the day that I do the thing today, today']
# new_vectorizer = CountVectorizer()
# print(new_vectorizer.fit_transform(new_text).toarray())

Vocabulary: 


In [28]:
words = vectorizer.vocabulary_

In [29]:
word_list = list(words.keys())

In [30]:
word_list

['flicka__',
 'realdonaldtrump',
 'president',
 'mar',
 'lago',
 'club',
 'amazing',
 'tonight',
 'everybody',
 'biggest',
 'hottest',
 'palm',
 'beach',
 'lucky',
 'best',
 'world',
 'united',
 'states',
 'sentedcruz',
 'vice',
 'talentlesscook',
 'year',
 'away',
 'pick',
 'running',
 'mate',
 'happy',
 'new',
 'yankeejayman',
 'u',
 'sleep',
 'sir',
 't',
 'wait',
 'celebrityapprentice',
 'sparkleawin',
 'excited',
 'amp',
 'celeb',
 'apprentice',
 'trumps',
 'boardroom',
 'believe',
 'bring',
 'country',
 'mr',
 'trump',
 'thanks',
 'dan',
 'run',
 'needs',
 'greatest',
 'places',
 'live',
 'happynewyear',
 'sarapattersonn',
 'hey',
 'ericleebow',
 'hope',
 'finally',
 's',
 'resolution',
 'good',
 'thought',
 'felix_whiskas',
 'interested',
 'balancing',
 'work',
 'pleasure',
 'stop',
 'instead',
 'make',
 'pleasurable',
 'samanthajrose',
 'trumpfan',
 'trumpforpresident',
 'iintend',
 'donaldtrump',
 'time',
 'million',
 'debt',
 'came',
 'flinched',
 'respect',
 'thank',
 'seank

In [31]:
_word_ = {'word': word_list}

In [32]:
word_df = pd.DataFrame(data=_word_)

In [33]:
word_df.head()

Unnamed: 0,word
0,flicka__
1,realdonaldtrump
2,president
3,mar
4,lago


In [34]:
word_df.to_csv('../data/words.csv', index=False)

# Save the tweets to a csv file for count

In [35]:
tweets_text_data.to_csv('../data/tweet_only.csv', index=False)