In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import unicodedata




Initializing

In [8]:
# Bin count; it is only used to build the input and output CSV file names.
bin_num = 100

# Read the binned articles from a CSV file in the working directory.
source_csv = 'Immigration_news_data_content_bin' + str(bin_num) + '(2).csv'
data = pd.read_csv(source_csv)

# Pull the two columns used downstream out as plain Python lists.
# NOTE: despite its name, `year` holds the raw values of the 'date' column.
content = data['content'].tolist()
year = data['date'].tolist()

Load the data from a local CSV file

In [9]:
# Number of documents loaded. The call form of print runs on Python 2 and 3.
print(len(content))

100


There are 100 bins here, one per row of the CSV file

In [10]:
# Preview the first rows of the loaded frame (call form works on Python 2 and 3).
print(data.head())

   Unnamed: 0                                            content  \
0           0  israeli paper says jews released ceausescu pai...   
1           0  religion notes lead giving gospels away weeks ...   
2           0  florida center holding aliens inquiry lead ami...   
3           0  brooklyn landlord keep racial mix looks soviet...   
4           0  israeli committee backs settlement east jerusa...   

                    date  
0  1990-01-01,1990-03-09  
1  1990-03-10,1990-05-16  
2  1990-05-16,1990-07-24  
3  1990-07-24,1990-10-16  
4  1990-10-16,1991-01-06  


In [11]:
# Fit a TF-IDF model over the documents in `content`.
# Terms found in more than 70% or fewer than 2% of the documents are dropped,
# English stop words are removed, 1- to 3-word n-grams are considered, and the
# vocabulary is capped at 20000 terms.
vectorizer = TfidfVectorizer(max_df=.7, max_features=20000,
                             min_df=.02, stop_words='english',
                             use_idf=True, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(content)
idf = vectorizer.idf_

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); support both so the cell runs on old and new releases.
# Fetch the vocabulary once and reuse it below.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Map every term to its IDF weight. `word_dict` is an independent copy of
# `words`, as before, built without the Python-2-only dict.iteritems().
words = dict(zip(feature_names, idf))
word_dict = dict(words)


Use TF-IDF from Sklearn to analyze the content

In [12]:
# (rows, columns) of the TF-IDF matrix; call form of print runs on Python 2 and 3.
print(tfidf_matrix.shape)

(100, 20000)


100 is the number of rows (one per bin). 20000 is the number of columns, i.e. the total number of feature terms

In [13]:
# Number of terms that received an IDF weight (call form works on Python 2 and 3).
print(len(word_dict))

20000


Sort the words by their idf values

In [14]:
# Rank the (term, idf) pairs from the highest IDF weight to the lowest.
keyword_list = sorted(
    word_dict.items(),
    key=lambda term_idf: term_idf[1],
    reverse=True,
)

In [15]:
# Print every (term, idf) pair in ranked order.
# print(...) with a single argument behaves the same on Python 2 and 3, and the
# loop variable no longer uses the easily misread name `l`.
for term_idf in keyword_list:
    print(term_idf)

In [16]:
# Materialise the sparse TF-IDF matrix as a dense one.
# NOTE(review): the row extraction further down reads `dense[i].tolist()[0]`,
# i.e. it expects each `dense[i]` to still be 2-D (a 1 x n_terms matrix).
# Keep todense() here; switching to toarray() would break that indexing.
dense = tfidf_matrix.todense()

Convert the sklearn sparse matrix into a dense matrix

In [18]:
# One plain list of TF-IDF scores per document, in the same order as `content`.
# Each dense[row_index] is 2-D, so tolist() yields [[...]] and [0] unwraps the row.
clusters = [dense[row_index].tolist()[0] for row_index in range(len(content))]

In [27]:

def extract_keywords(data, names=None):
    """Rank the terms of one document by their TF-IDF score.

    Parameters
    ----------
    data : sequence of float
        TF-IDF scores of a single document; position ``i`` holds the score
        of the term ``names[i]``.
    names : sequence of unicode text, optional
        Vocabulary used to translate a position into its term. When omitted,
        the notebook-level ``feature_names`` is used, which keeps existing
        one-argument calls working unchanged.

    Returns
    -------
    keywords : list
        ASCII-encoded terms whose score is above zero, highest score first.
    keywords_pair : list of tuple
        The same terms as ``(term, score)`` pairs, in the same order.
    """
    if names is None:
        # Backward-compatible default: fall back to the global vocabulary.
        names = feature_names

    # Keep only the terms with a positive score for this document.
    phrase_scores = [(word_id, score)
                     for word_id, score in enumerate(data)
                     if score > 0]
    # The sort is stable, so terms with equal scores keep their vocabulary order.
    phrase_scores.sort(key=lambda pair: -pair[1])

    keywords = []
    keywords_pair = []
    for word_id, score in phrase_scores:
        # Decompose accented characters (NFKD), then drop anything non-ASCII.
        phrase = unicodedata.normalize('NFKD', names[word_id]).encode('ascii', 'ignore')
        keywords.append(phrase)
        keywords_pair.append((phrase, score))
    return keywords, keywords_pair


Extract the keywords for every row (i.e. every bin) of the matrix

In [28]:
# Per-document results: the ten best-scoring terms, and the full ranked
# (term, score) list. Note that `keyword_list` is rebound here and no longer
# holds the (term, idf) ranking built earlier in the notebook.
keyword_list, keywords_pair_list = [], []
for cluster in clusters:
    ranked = extract_keywords(cluster)
    keywords, keywords_pair = ranked
    keywords_pair_list.append(keywords_pair)
    keyword_list.append(keywords[:10])

In [29]:
# Assemble the per-bin results into one frame with a fixed column order and
# write it to a CSV file whose name carries the bin count.
column_order = ['content', "date", "keywords", "keywords pair"]
column_values = [content, year, keyword_list, keywords_pair_list]
new_data = dict(zip(column_order, column_values))

frame = pd.DataFrame(new_data, columns=column_order)
output_csv = 'Immigration_news_data_content_bin' + str(bin_num) + '.csv'
frame.to_csv(output_csv)

In [30]:
# Read back the CSV written by the previous cell and preview it.
# `frame` is rebound to the reloaded data. The call form of print runs on
# Python 2 and 3.
frame = pd.read_csv('Immigration_news_data_content_bin' + str(bin_num) + '.csv')
print(frame.head())

   Unnamed: 0                                            content  \
0           0  israeli paper says jews released ceausescu pai...   
1           1  religion notes lead giving gospels away weeks ...   
2           2  florida center holding aliens inquiry lead ami...   
3           3  brooklyn landlord keep racial mix looks soviet...   
4           4  israeli committee backs settlement east jerusa...   

                    date                                           keywords  \
0  1990-01-01,1990-03-09  ['immigration emigration', 'emigration', 'lead...   
1  1990-03-10,1990-05-16  ['immigration emigration', 'emigration', 'sovi...   
2  1990-05-16,1990-07-24  ['immigration emigration', 'emigration', 'gorb...   
3  1990-07-24,1990-10-16  ['immigration emigration', 'emigration', 'elli...   
4  1990-10-16,1991-01-06  ['immigration emigration', 'emigration', 'weir...   

                                       keywords pair  
0  [('immigration emigration', 0.4133313415845711...  
1  [('

In [58]:
# sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
# for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
#    print('{0: <20} {1}'.format(phrase, score))

tyson                0.356025083595
foods                0.180862562832
tyson foods          0.180862562832
hunger               0.157924144013
hunger strike        0.15071880236
guilty               0.129637653581
tuition              0.129637653581
kennedy              0.126339315211
middle eastern       0.126339315211
inmate               0.120575041888
singapore            0.117234004377
australia            0.114213758153
pilot                0.114213758153
trial                0.112352633104
government said      0.10551060394
taiwan               0.10551060394
hong                 0.103710122865
hong kong            0.103710122865
kong                 0.103710122865
strike               0.103710122865
