vector and count the words in tweets.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import pandas as pd

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import func, inspect
from sqlalchemy import Table, Column, Integer, String, Float, DateTime, MetaData

In [3]:
# Create an engine for the  FemaData.db database

engine = create_engine("sqlite:///../data/data.sqlite", echo=False)
conn = engine.connect()

# Reflect Database into ORM classes
Base = automap_base()
Base.prepare(engine, reflect=True)
meta = MetaData()

tweets = Table(
   'tweets', meta, 
    Column('source',String), 
    Column('text', String), 
    Column('created_at', DateTime),
    Column('retweet_count', Integer),
    Column('favorite_count', Integer),
    Column('id_str', Integer, primary_key = True)
)

no_retweets = Table(
   'no_retweets', meta, 
    Column('source',String), 
    Column('text', String), 
    Column('created_at', DateTime),
    Column('retweet_count', Integer),
    Column('favorite_count', Integer),
    Column('id_str', Integer, primary_key = True)
)


In [4]:
#- select column 'text' from the tweets table - this has no retweets
tweets_text_data = pd.read_sql('SELECT TEXT FROM TWEETS', conn)


In [5]:
tweets = tweets_text_data.text
tweets.head()

0          "@flicka__: @realDonaldTrump for president"
1    The Mar-a-Lago Club was amazing tonight. Every...
2    "@archangeljf12:  ;,@realDonaldTrump for Presi...
3    "@TalentlessCook: @realDonaldTrump You're only...
4    "@yankeejayman: @realDonaldTrump @flicka__ Do ...
Name: text, dtype: object

In [6]:



# To create a Count Vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer, but
# for the most basic example, it is not needed.
vectorizer = CountVectorizer( lowercase=True, stop_words='english')

# For our text, we are going to take some text from our previous blog post
# about count vectorization
# sample_text = ["One of the most basic ways we can numerically represent words "
#                "is through the one-hot encoding method (also sometimes called "
#                "count vectorizing)."]


vector_text = tweets

# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fix
vectorizer.fit(vector_text)

# Now, we can inspect how our vectorizer vectorized the text
# This will print out a list of words used, and their index in the vectors
print('Vocabulary: ')
print(vectorizer.vocabulary_)

# # If we would like to actually create a vector, we can do so by passing the
# # text into the vectorizer to get back counts
# vector = vectorizer.transform(vector_text)

# # Our final vector:
# print('Full vector: ')
# print(vector.toarray())

# # Or if we wanted to get the vector for one word:
# print('Hot vector: ')
# print(vectorizer.transform(['hot']).toarray())

# # Or if we wanted to get multiple vectors at once to build matrices
# print('Hot and one: ')
# print(vectorizer.transform(['hot', 'one']).toarray())

# # We could also do the whole thing at once with the fit_transform method:
# print('One swoop:')
# new_text = ['Today is the day that I do the thing today, today']
# new_vectorizer = CountVectorizer()
# print(new_vectorizer.fit_transform(new_text).toarray())

Vocabulary: 


In [10]:
tweets_text_data.to_csv('../data/tweet_only.csv')

In [15]:
words = vectorizer.vocabulary_

In [16]:
words

{'flicka__': 10339,
 'realdonaldtrump': 21820,
 'president': 20792,
 'mar': 16821,
 'lago': 15630,
 'club': 5969,
 'amazing': 2613,
 'tonight': 26718,
 'everybody': 9568,
 'biggest': 4071,
 'hottest': 12628,
 'palm': 19747,
 'beach': 3790,
 'lucky': 16476,
 'best': 3966,
 'world': 29392,
 'archangeljf12': 3031,
 'united': 27740,
 'states': 25068,
 'sentedcruz': 23894,
 'vice': 28332,
 'winningticket2016': 29207,
 'trumpcruz2016': 27078,
 'talentlesscook': 25821,
 'year': 29940,
 'away': 3422,
 '2016': 479,
 'pick': 20275,
 'running': 23108,
 'mate': 17001,
 'happy': 12002,
 'new': 18548,
 'yankeejayman': 29900,
 'sleep': 24456,
 'joshhammer77': 14596,
 'sir': 24335,
 'wait': 28707,
 'celebrityapprentice': 5379,
 'sparkleawin': 24783,
 'excited': 9637,
 'amp': 2693,
 'celeb': 5364,
 'apprentice': 2979,
 'trumps': 27150,
 'boardroom': 4358,
 'danwolff22': 7240,
 'believe': 3881,
 'bring': 4709,
 'country': 6692,
 'mr': 17985,
 'trump': 27048,
 'thanks': 26229,
 'dan': 7187,
 'tlowery19':

In [31]:
word_df = pd.DataFrame.from_dict(words, orient='index')

In [32]:
word_df.head()

Unnamed: 0,0
flicka__,10339
realdonaldtrump,21820
president,20792
mar,16821
lago,15630


In [33]:
word_df.to_csv('../data/words.csv')

In [34]:
words2 = pd.read_csv('../data/words.csv')

In [35]:
words2.head()

Unnamed: 0,word,nmbr
0,flicka__,10339
1,realdonaldtrump,21820
2,president,20792
3,mar,16821
4,lago,15630


In [36]:
wordslist = words2['word']

In [37]:
wordslist

0               flicka__
1        realdonaldtrump
2              president
3                    mar
4                   lago
              ...       
30548           sullivan
30549         wzlb5s41m3
30550            severly
30551          chastised
30552            denials
Name: word, Length: 30553, dtype: object