In [1]:
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances


import os

# Importing all of the Fed Speeches
import pickle

In [2]:
df = pickle.load(open("mvp_fed_press_rel", 'rb'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 2 columns):
date    42 non-null datetime64[ns]
text    42 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 752.0+ bytes


In [8]:
df['text'][40]

"\n       Information received since the Federal Open Market Committee met in September suggests that economic activity is expanding at a moderate pace. Labor market conditions improved somewhat further, with solid job gains and a lower unemployment rate. On balance, a range of labor market indicators suggests that underutilization of labor resources is gradually diminishing. Household spending is rising moderately and business fixed investment is advancing, while the recovery in the housing sector remains slow. Inflation has continued to run below the Committee's longer-run objective. Market-based measures of inflation compensation have declined somewhat; survey-based measures of longer-term inflation expectations have remained stable.\n    \n       Consistent with its statutory mandate, the Committee seeks to foster maximum employment and price stability. The Committee expects that, with appropriate policy accommodation, economic activity will expand at a moderate pace, with labor ma

In [9]:
df.sort_values(by=['date'], ascending = False, inplace = True)
df.reset_index(drop=True, inplace=True)


In [10]:
df.head()

Unnamed: 0,date,text
0,2019-03-20,Information received since the Federal Open Ma...
1,2019-01-30,Information received since the Federal Open Ma...
2,2018-12-19,Information received since the Federal Open Ma...
3,2018-11-08,Information received since the Federal Open Ma...
4,2018-09-26,Information received since the Federal Open Ma...


In [11]:
df.tail()

Unnamed: 0,date,text
37,2014-07-30,\n Information received since the Federa...
38,2014-06-18,\n Information received since the Federa...
39,2014-04-30,\n Information received since the Federa...
40,2014-03-19,\n Information received since the Federa...
41,2014-01-29,\n Information received since the Federa...


In [17]:
i = 5
this_date = df['date'][i]
n_speeches = 5

In [15]:
this_date

Timestamp('2018-08-01 00:00:00')

In [12]:
def create_speech_dfs(df, date, numb_speeches):
    '''
    For a given date, the most recent speeches numb_speeches and the next date speeches
    are returned as dataframes

    INPUTS:
        df - the dataframe of all fed speeches
        date - the date needed to include no speeches before
        num_speeches - the number of most recent speeches to include

    OUTPUTS:
        hist_df - this is a subset of the original dataframe with same columns
        new_df - a subset of the original dataframe for the new speeches
    '''
    hist_df = df[df['date']< date]
    if len(hist_df)> numb_speeches:
            hist_df = hist_df.iloc[0:numb_speeches]

    new_df = df[df['date']==date]

    return hist_df, new_df

In [18]:
h_df, n_df = create_speech_dfs(df, this_date, n_speeches)

In [20]:
n_df

Unnamed: 0,date,text
5,2018-08-01,Information received since the Federal Open Ma...


In [21]:
doc_list = list(h_df['text'])

In [22]:
doc_list

["Information received since the Federal Open Market Committee met in May indicates that the labor market has continued to strengthen and that economic activity has been rising at a solid rate. Job gains have been strong, on average, in recent months, and the unemployment rate has declined. Recent data suggest that growth of household spending has picked up, while business fixed investment has continued to grow strongly. On a 12-month basis, both overall inflation and inflation for items other than food and energy have moved close to 2 percent. Indicators of longer-term inflation expectations are little changed, on balance.Consistent with its statutory mandate, the Committee seeks to foster maximum employment and price stability. The Committee expects that further gradual increases in the target range for the federal funds rate will be consistent with sustained expansion of economic activity, strong labor market conditions, and inflation near the Committee's symmetric 2 percent objecti

In [34]:
tfidvect =TfidfVectorizer(lowercase=True,
                          stop_words='english',
                          max_features = 2000,
                          norm = 'l2',
                          use_idf = True,
                          smooth_idf=True,
                          sublinear_tf = False,
                          ngram_range = (1,3))

In [35]:
tfidf_vectorized = tfidvect.fit_transform(doc_list).toarray()

In [36]:
tfidvect.fit_transform(doc_list)

<5x1062 sparse matrix of type '<class 'numpy.float64'>'
	with 3211 stored elements in Compressed Sparse Row format>

In [38]:
new_text = n_df['text']
new_tokens = tfidf_vectorized.transform(new_text)

AttributeError: 'numpy.ndarray' object has no attribute 'transform'

In [39]:
def transform_new_speech(new_df, model):
    '''
    takes the model fit on historical speeches and fits the new text to this model
    returns new_tokens which can go into the similarity calculation
    '''
    new_text = new_df['text']
    new_tokens = model.transform(new_text)
    return new_tokens

In [40]:
new_tokens = transform_new_speech(n_df, tfidvect)

In [41]:
new_tokens

<1x1062 sparse matrix of type '<class 'numpy.float64'>'
	with 442 stored elements in Compressed Sparse Row format>

In [42]:
tfidvect.vocabulary_


{'information': 470,
 'received': 851,
 'federal': 326,
 'open': 703,
 'market': 581,
 'committee': 135,
 'met': 621,
 'indicates': 428,
 'labor': 532,
 'continued': 190,
 'strengthen': 961,
 'economic': 243,
 'activity': 31,
 'rising': 891,
 'solid': 922,
 'rate': 806,
 'job': 516,
 'gains': 371,
 'strong': 967,
 'average': 66,
 'recent': 854,
 'months': 657,
 'unemployment': 1026,
 'declined': 215,
 'data': 200,
 'suggest': 982,
 'growth': 391,
 'household': 401,
 'spending': 932,
 'picked': 746,
 'business': 103,
 'fixed': 334,
 'investment': 482,
 'grow': 388,
 'strongly': 979,
 '12': 0,
 'month': 653,
 'basis': 87,
 'overall': 715,
 'inflation': 436,
 'items': 501,
 'food': 346,
 'energy': 270,
 'moved': 668,
 'close': 125,
 'percent': 731,
 'indicators': 431,
 'longer': 549,
 'term': 1006,
 'expectations': 298,
 'little': 546,
 'changed': 119,
 'balance': 72,
 'consistent': 185,
 'statutory': 954,
 'mandate': 570,
 'seeks': 916,
 'foster': 351,
 'maximum': 596,
 'employment': 259

In [46]:
cosine_sims = linear_kernel(new_tokens, tfidf_vectorized)
cosine_sims

array([[0.90297468, 0.74575048, 0.67955391, 0.68107108, 0.57003126]])

In [49]:
euclid_dist = euclidean_distances(new_tokens, tfidf_vectorized)
euclid_dist

array([[0.44051179, 0.71309119, 0.80055742, 0.79866003, 0.92732814]])

In [57]:
# need to find the max tfid
tfidf_vectorized.shape

(5, 1062)

In [52]:
whos

Variable               Type                Data/Info
----------------------------------------------------
CountVectorizer        type                <class 'sklearn.feature_e<...>on.text.CountVectorizer'>
PorterStemmer          ABCMeta             <class 'nltk.stem.porter.PorterStemmer'>
RegexpTokenizer        ABCMeta             <class 'nltk.tokenize.regexp.RegexpTokenizer'>
SnowballStemmer        ABCMeta             <class 'nltk.stem.snowball.SnowballStemmer'>
TfidfVectorizer        type                <class 'sklearn.feature_e<...>on.text.TfidfVectorizer'>
WordNetLemmatizer      type                <class 'nltk.stem.wordnet.WordNetLemmatizer'>
cosine_sims            ndarray             1x5: 5 elems, type `float64`, 40 bytes
create_speech_dfs      function            <function create_speech_dfs at 0x10bbf5268>
df                     DataFrame                    date            <...>eived since the Federa...
doc_list               list                n=5
euclid_dist            ndarray