In [5]:
import graphlab
# Render GraphLab Canvas visualizations inline in the notebook
# instead of opening a separate browser tab.
graphlab.canvas.set_target("ipynb")
In [6]:
# Load the corpus: one raw text document per row (column auto-named 'X1').
# NOTE(review): hardcoded absolute local path — prefer a configurable data dir.
sf = graphlab.SFrame.read_csv("/Users/zhaoenche/desktop/haha/data/w15", header=False)
This non-commercial license of GraphLab Create for academic use is assigned to 16210130103@fudan.edu.cn and will expire on May 05, 2018.
[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1495191319.log
Finished parsing file /Users/zhaoenche/desktop/haha/data/w15
Parsing completed. Parsed 100 lines in 0.894406 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Read 12278 lines. Lines per second: 9608.88
Finished parsing file /Users/zhaoenche/desktop/haha/data/w15
Parsing completed. Parsed 72269 lines in 2.63881 secs.
In [7]:
# Display the head of the loaded SFrame (72269 rows, single text column X1).
sf
Out[7]:
X1
aynrand born and educated
in russia rand migrated ...
asphalt in american
english asphalt or ...
actinopterygii the
actinopterygii consti ...
altaiclanguages these
language families share ...
argon the name argon is
derived from the greek ...
augustderleth a 1938
guggenheim fellow der ...
amateur amateurism can be
seen in both a negative ...
assemblyline an assembly
line is a manufacturing ...
astronomicalunit an
astronomical unit ...
abbess an abbess latin
abbatissa feminine form ...
[72269 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [8]:
# Inspect the SArray API surface to find text-processing helpers.
dir(sf['X1'])
Out[8]:
['_SArray__check_min_observations',
 '_SArray__construct_ctr',
 '__abs__',
 '__add__',
 '__and__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__div__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__get_content_identifier__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__has_size__',
 '__hash__',
 '__init__',
 '__is_materialized__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__materialize__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__proxy__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '_count_ngrams',
 '_count_words',
 '_getitem_cache',
 '_save_as_text',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'astype',
 'clip',
 'clip_lower',
 'clip_upper',
 'contains',
 'cumulative_max',
 'cumulative_mean',
 'cumulative_min',
 'cumulative_std',
 'cumulative_sum',
 'cumulative_var',
 'date_range',
 'datetime_to_str',
 'dict_has_all_keys',
 'dict_has_any_keys',
 'dict_keys',
 'dict_trim_by_keys',
 'dict_trim_by_values',
 'dict_values',
 'dropna',
 'dtype',
 'fillna',
 'filter',
 'from_avro',
 'from_const',
 'from_sequence',
 'hash',
 'head',
 'is_in',
 'is_materialized',
 'item_length',
 'materialize',
 'max',
 'mean',
 'min',
 'nnz',
 'num_missing',
 'pixel_array_to_image',
 'random_integers',
 'random_split',
 'rolling_count',
 'rolling_max',
 'rolling_mean',
 'rolling_min',
 'rolling_stdv',
 'rolling_sum',
 'rolling_var',
 'sample',
 'save',
 'shape',
 'show',
 'size',
 'sketch_summary',
 'sort',
 'split_datetime',
 'std',
 'str_to_datetime',
 'subslice',
 'sum',
 'tail',
 'to_numpy',
 'topk_index',
 'unique',
 'unpack',
 'var',
 'vector_slice',
 'where']
In [9]:
# Bag-of-words: turn each document string into a {word: count} dict.
# NOTE(review): _count_words() is a private SArray method; the public
# equivalent is graphlab.text_analytics.count_words() — confirm and prefer it.
bow = sf['X1']._count_words()
In [10]:
# Confirm the raw text column is an SArray.
type(sf['X1'])
Out[10]:
graphlab.data_structures.sarray.SArray
In [11]:
# The bag-of-words result is also an SArray (of dicts).
type(bow)
Out[11]:
graphlab.data_structures.sarray.SArray
In [12]:
# Per-document binary indicator: 1 if the word 'limited' occurs, else 0.
bow.dict_has_any_keys(['limited'])
Out[12]:
dtype: int
Rows: 72269
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... ]
In [13]:
# First 20 word counts (dict values) of the first document.
bow.dict_values()[0][:20]
Out[13]:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]
In [14]:
# Attach the bag-of-words representation as a new column of sf.
sf['bow'] = bow
In [15]:
# Sanity check: the new column is stored as an SArray.
type(sf['bow'])
Out[15]:
graphlab.data_structures.sarray.SArray
In [16]:
# One bag-of-words dict per document — length matches the row count (72269).
len(sf['bow'])
Out[16]:
72269
In [17]:
# Peek at 5 (word, count) pairs from the first document's bag of words.
# (Python 2: dict.items() returns a list, so slicing works directly.)
sf['bow'][0].items()[:5]
Out[17]:
[('limited', 3),
 ('writings', 2),
 ('personally', 1),
 ('four', 1),
 ('controversial', 1)]
In [18]:
# TF-IDF weighting: up-weights words frequent in a document but rare
# across the corpus.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])
In [19]:
# Same 5 words as the bag-of-words peek, now with their TF-IDF scores.
sf['tfidf'][0].items()[:5]
Out[19]:
[('limited', 10.04705669672047),
 ('writings', 9.76010421134325),
 ('personally', 5.001941923280662),
 ('four', 2.1272386886969024),
 ('controversial', 4.375805453003677)]
In [20]:
# Visualize the SFrame in GraphLab Canvas (rendered inline, see set_target).
sf.show()
In [22]:
# Display the SFrame again — now with the added 'bow' and 'tfidf' columns.
sf
Out[22]:
X1 bow tfidf
aynrand born and educated
in russia rand migrated ...
{'limited': 3,
'writings': 2, ...
{'limited':
10.04705669672047, ...
asphalt in american
english asphalt or ...
{'all': 1, 'accadian': 1,
'similarity': 1, ...
{'all':
1.3891905239989626, ...
actinopterygii the
actinopterygii consti ...
{'andreolepis': 1, 'all':
1, 'evolutionary': 2, ...
{'andreolepis':
11.188150547181156, ...
altaiclanguages these
language families share ...
{'sergei': 3, 'all': 6,
'todays': 1, 'chinese': ...
{'sergei':
20.031873121992916, ...
argon the name argon is
derived from the greek ...
{'limited': 1,
'embolism': 1, ...
{'limited':
3.3490188989068232, ...
augustderleth a 1938
guggenheim fellow der ...
{'evelyn': 1,
'detective': 4, ...
{'evelyn':
6.7937013925087175, ...
amateur amateurism can be
seen in both a negative ...
{'since': 1, 'subpar': 1,
'lack': 2, 'valuable' ...
{'since':
1.8775124538896095, ...
assemblyline an assembly
line is a manufacturing ...
{'all': 3, 'concept': 6,
'consider': 1, 'chine ...
{'all':
4.167571571996888, ...
astronomicalunit an
astronomical unit ...
{'precise': 1, 'all': 2,
'chinese': 1, 'suns': 1, ...
{'precise':
5.491057060675752, 'a ...
abbess an abbess latin
abbatissa feminine form ...
{'kildares': 1, 'they':
4, 'founder': 1, ...
{'kildares':
11.188150547181156, ...
[72269 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [23]:
# Drop rare words from each document: keep only entries whose count
# meets the lower bound of 2.
docs = sf['bow'].dict_trim_by_values(2)
In [24]:
# Remove common English stopwords (exclude=True drops the listed keys
# rather than keeping only them).
docs = docs.dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)
In [25]:
# Fit a topic model with default settings — per Out[26] below:
# 10 topics, 10 Gibbs sampling iterations, alpha=5.0, beta=0.1.
m = graphlab.topic_model.create(docs)
Learning a topic model
       Number of documents     72269
           Vocabulary size    171005
   Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10        | 4.01s         | 5.52083e+06    | 0               |
+-----------+---------------+----------------+-----------------+
In [26]:
# Display the model summary (schema, settings, accessible fields).
m
Out[26]:
Class                          : TopicModel

Schema
------
Vocabulary Size                : 171005

Settings
--------
Number of Topics               : 10
alpha                          : 5.0
beta                           : 0.1
Iterations                     : 10
Training time                  : 5.0212
Verbose                        : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.
In [27]:
# Most probable words per topic (default view; 50 rows = 10 topics x 5 words).
m.get_topics()
Out[27]:
topic word score
0 season 0.020896888369
0 league 0.0134398622775
0 film 0.0124702134625
0 line 0.0122774608149
0 club 0.0108237232896
1 school 0.0331154223
1 music 0.0152993491598
1 government 0.015077047931
1 de 0.012720045861
1 college 0.0120683682861
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [28]:
# For each topic, collapse its (word, score) rows into one dict and print
# just the words. NOTE(review): this is duplicated below as print_topics() —
# the function version should be preferred.
topics = m.get_topics().unstack(['word','score'], new_column_name='topic_words')['topic_words'].apply(lambda x: x.keys())
for topic in topics:
    print topic
['county', 'city', 'people', 'party', 'house']
['album', 'university', 'age', 'state', 'years']
['club', 'league', 'line', 'film', 'season']
['world', 'life', 'made', 'final', 'show']
['released', 'game', '2009', '2007', 'song']
['water', 'found', 'team', 'year', 'played']
['music', 'school', 'de', 'college', 'government']
['town', 'time', 'work', 'family', 'population']
['series', 'region', 'system', 'world', 'research']
['south', 'company', 'back', 'national', 'time']
In [29]:
# Predict the single most likely topic id for each document.
pred = m.predict(docs)
In [30]:
# Visualize the distribution of predicted topic assignments.
pred.show()
In [31]:
# Predict full per-topic probability vectors instead of a single topic id.
pred = m.predict(docs, output_type='probabilities')
In [32]:
# Inspect the model's vocabulary (171005 unique words).
m['vocabulary']
Out[32]:
dtype: str
Rows: 171005
['duke', 'studies', 'journal', 'chris', 'research', 'matthew', 'crisis', 'financial', 'paul', '1987', 'reagan', 'traditional', 'rightwing', 'nominee', 'libertarianism', 'cato', 'chief', 'smith', 'line', 'south', 'nick', '1999', 'documentary', 'animated', 'shows', 'references', 'commentator', 'powerful', 'ethics', 'rush', 'neil', 'lives', 'cited', 'produced', 'night', 'originality', 'interest', '2007', 'individual', 'authors', 'admirer', 'married', 'club', 'library', 'essays', 'recent', '2009', 'burns', 'inspiration', 'artist', 'women', 'early', 'barbara', 'organized', 'gave', 'referred', 'company', 'personalist', 'criticism', 'john', 'reviewers', 'language', 'understanding', 'writes', 'fewer', 'attention', 'positive', 'masterful', 'review', 'times', 'critic', 'praise', 'theory', 'randian', 'importance', 'calling', 'nonfiction', 'academics', 'kant', 'philosophers', 'italian', 'remarked', 'wife', 'house', 'subject', 'scholarly', 'edward', 'system', 'influence', 'acknowledged', '100', 'branden', 'criticized', 'sacrificing', 'exist', 'selfinterest', 'rational', 'communism', 'journals', 'copies', ... ]
In [33]:
# Per-word topic probabilities: one probability vector per vocabulary word.
m['topics']
Out[33]:
topic_probabilities vocabulary
[1.47139425641e-07,
1.52261115632e-07, ...
duke
[1.47139425641e-07,
1.52261115632e-07, ...
studies
[1.47139425641e-07,
0.000361011105164, ...
journal
[0.00072407311358,
1.52261115632e-07, ...
chris
[1.47139425641e-07,
1.52261115632e-07, ...
research
[1.47139425641e-07,
1.52261115632e-07, ...
matthew
[1.47139425641e-07,
1.52261115632e-07, ...
crisis
[1.47139425641e-07,
1.52261115632e-07, ...
financial
[1.47139425641e-07,
1.52261115632e-07, ...
paul
[1.47139425641e-07,
1.52261115632e-07, ...
1987
[171005 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [34]:
def print_topics(m):
    topics = m.get_topics(num_words=5)
    topics = topics.unstack(['word','score'], new_column_name='topic_words')['topic_words']
    topics = topics.apply(lambda x: x.keys())
    for topic in topics:
        print topic
print_topics(m)
['county', 'city', 'people', 'party', 'house']
['album', 'university', 'age', 'state', 'years']
['club', 'league', 'line', 'film', 'season']
['world', 'life', 'made', 'final', 'show']
['released', 'game', '2009', '2007', 'song']
['water', 'found', 'team', 'year', 'played']
['music', 'school', 'de', 'college', 'government']
['town', 'time', 'work', 'family', 'population']
['series', 'region', 'system', 'world', 'research']
['south', 'company', 'back', 'national', 'time']
In [36]:
# Warm start: initialize a new 10-topic model from the word/topic
# distributions learned by the previous model m.
m2 = graphlab.topic_model.create(docs,
                                 num_topics=10,
                                 initial_topics=m['topics'])
Initializing from provided topics and vocabulary.
Learning a topic model
       Number of documents     72269
           Vocabulary size    171005
   Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10        | 3.75s         | 5.71496e+06    | 0               |
+-----------+---------------+----------------+-----------------+
In [37]:
# Build a word-topic association: pin the word 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]
In [38]:
# Larger model: 20 topics, 50 Gibbs iterations, with the 'recognition'
# association above fixed during training; verbose=False suppresses
# per-iteration progress (the table below was still emitted by the server).
m2 = graphlab.topic_model.create(docs,
                                 num_topics=20,
                                 num_iterations=50,
                                 associations=associations, 
                                 verbose=False)
Learning a topic model
       Number of documents     72269
           Vocabulary size    171005
   Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10        | 4.26s         | 5.44692e+06    | 0               |
| 20        | 8.01s         | 5.39097e+06    | 0               |
| 30        | 11.58s        | 5.50692e+06    | 0               |
| 40        | 15.35s        | 4.40759e+06    | 0               |
| 50        | 19.24s        | 5.90682e+06    | 0               |
+-----------+---------------+----------------+-----------------+
In [39]:
# Top 10 words per topic for the 20-topic model (200 rows total).
m2.get_topics(num_words=10)
Out[39]:
topic word score
0 south 0.0166799708982
0 year 0.0149383260486
0 australia 0.0113098992786
0 years 0.00991062905756
0 australian 0.00972827735322
0 canada 0.0090286422427
0 united 0.00848530859305
0 national 0.00755866421794
0 park 0.00724233983287
0 american 0.00721628958939
[200 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
In [40]:
# Print the top words of each of the 20 topics using the helper above.
print_topics(m2)
['work', 'series', 'book', 'film', 'published']
['time', 'series', '2009', 'back', 'made']
['son', 'death', 'court', 'family', 'law']
['high', 'students', 'school', 'college', 'university']
['years', '18', 'age', 'population', 'city']
['king', 'god', 'life', 'book', 'church']
['years', 'australian', 'australia', 'south', 'year']
['party', 'state', 'de', 'election', 'french']
['album', 'released', 'song', 'radio', 'show']
['states', 'company', 'united', 'system', 'government']
['building', 'house', 'company', 'york', 'st']
['system', 'theory', 'data', 'number', 'software']
['island', 'english', 'people', 'language', 'century']
['species', 'water', 'land', 'lake', 'area']
['band', 'tour', 'music', 'york', 'time']
['german', 'aircraft', 'force', 'service', 'air']
['large', 'light', 'small', 'made', 'art']
['city', 'line', 'river', 'road', 'west']
['military', 'government', 'general', 'war', 'army']
['league', 'game', 'team', 'games', 'season']
In [ ]: