# Anseri Topic Analysis Tutorial

## 0. Open a dataset

 ### Dataset Format:

In [None]:
#The recommended approach is to let anseri create and format the SQLite database.
#The developer just needs to be able to convert their data source into an iterable collection of dicts, 
#where each dict has the same keys, and the keys will correspond to columns in a database table. 

#FOR NOW call:

def from_iterable(cls, iterable, dbname, dbpath, contentcols, timecol):
   """Create a new database from an iterable collection of dicts.

   Each dict should have the same keys; the keys correspond to columns in a
   database table. Only allows for single-valued attributes for now.

   Args:
       cls: Presumably the class this constructor is bound to (the signature
           suggests a classmethod) -- TODO confirm.
       iterable: Iterable data source, should be a sequence of dicts.
       dbname: Name of dataset
       dbpath: Path to the database, where it should be created
       contentcols: Names of columns containing text content to be analyzed.
       timecol: Name of column containing timestamps, or None.

   Returns:
       :py:class:`SQLiteController_FTS`: Instantiated raw content database backend
   """

#Alternatively, you can convert a pandas dataframe into a database:

def from_dataframe(cls, df, dbname, dbpath, contentcols, timecol, dateformat=None, splitter=None):
   """
   Import a pandas dataframe into an sqlite database table.

   Args:
       cls: Presumably the class this constructor is bound to (the signature
           suggests a classmethod) -- TODO confirm.
       df (pandas.DataFrame): dataframe
       dbname (str): name of dataset
       dbpath (str): path to database, where it should be created
       contentcols (str | list[str]): names of columns containing content to be analyzed
       timecol (str | None): name of column containing timestamps, or None. If None, will detect
           column named 'time' or 'date'.
       dateformat (str | None): format used to parse dates when converting them to unix
           timestamps. Defaults to None.
       splitter: `anseri` Content splitter, optionally split columns by sentence.
           Defaults to None.

   Returns:
       :py:class:`SQLiteController_FTS`:  Instantiated raw content database backend
   """
#Splitters allow content to be broken up by sentence or fixed word-length windows. 
#These are classes defined in DatabaseController.pyx, and documentation should show up in the API docs. 
#Splitters accept iterables and are themselves iterable, 
#so they can be used with the from_iterable function as well.

#Database format:

#The sqlite db should contain a table named data, with at least three columns:
#content (string)
#title (string)
#time (int)



### Start here:

In [1]:
import anseri as ai
import numpy as np  

ai.disable_progress()  # Suppress progress notifications for a cleaner notebook

#import a sql dataset
d = ai.Dataset("aljazeera")

2016-12-07 20:10:01,675 [MainThread  ] [INFO ]  CONFIG FILE: /home/gabriel/.ai/config/aljazeera.cfg


## 1. Select Data

### Select ALL

In [3]:
#Select all the dataset
selection = ai.AllSelection()

### Select by time window

In [4]:
print(d.time_range)   # Get the unix timestamps of the min/max times of entries in database
print([ai.utc.mth(x) for x in d.time_range])   # Get human-readable representation of timestamps (mth = machine-to-human)

(1300233600, 1361836800)
['Mar 16 2011', 'Feb 26 2013']


In [5]:
selection = ai.TimeSelection(('Mar 1 2012', 'Jan 2 2013'))   # Time windows specified human-readable
selection = ai.TimeSelection((1300233600, 1350000000))  # Time windows specified machine-readable

### Attribute Selection

The aljazeera dataset has no defined data attributes apart from time. 

If the dataset had a column defined as an attribute, named "author", you could select documents authored by "Marwan Bishara" as follows:

In [6]:
# selection = ai.AttributeSelection("author", ["Marwan Bishara"])

### Full-text Search

In [7]:
selection = ai.FullTextSelection("iraq war")    # full text search with fuzzy matching

### Select by keyword (feature) mention

In [8]:
selection = ai.FeatureSelection("iraq")

## 2. Load data matching selection

In [9]:
model = d.load(selection)   # Load the sparse matrix model of corpus

In [10]:
# Number of documents matching the selection
len(selection.docids)   # selection.docids contain references to documents matching selection

193

### Load Raw Content from Database

In [11]:
# Fetch the first three matching documents and show the title of each one,
# falling back to the content when a document has no title field.
sample_ids = list(selection.docids)[:3]
for doc in d.get_documents_by_id(sample_ids, fields=['title', 'content']):
    print(doc.keys())
    if 'title' not in doc.keys():
        print("Content: \t{}".format(doc['content']))
    else:
        print("Title: \t\t{}".format(doc['title']))

    print("\n\n")

DOCIDS:[12802, 11276, 11789]
dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		Iraq Sunnis rally against Shia-led government



dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		Iraqi minister says arms deal 'not cancelled'



dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		US blacklists Syrian rebel group al-Nusra





In [12]:
print(doc.keys())    # Get the names of available fields in content

dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])


In [13]:
n, m = model.shape     # Get details about shape of sparse matrix representation of content
print("n documents: {:,}".format(n))
print("m features: {:,}".format(m))

n documents: 193
m features: 3,061


## 3. Get Topics

In [14]:
# Instantiate the algorithm.
# ignore_terms injects extra stop-words at runtime.
EXTRA_STOP_WORDS = ["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul",
                    "aug", "sep", "oct", "nov", "dec"]

SPCA = ai.topicmodels.TopicModelSPCA(
    n_topics=16,
    card_terms=8,
    card_docs=(n // 10),    # a good rule of thumb is to use 1 to 10% of documents
    ignore_terms=EXTRA_STOP_WORDS,
)

In [15]:
# Run the topic model on the loaded data model.
# NOTE(review): the same stop-word list is also passed to the TopicModelSPCA
# constructor as ignore_terms -- confirm whether repeating it here as
# ignore_words is required.
topics = SPCA(model, ignore_words=["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul", 
                                                   "aug", "sep", "oct", "nov", "dec"])  # Compute topics within the data model

In [16]:
print(topics)    # TopicModel object has a convenient representation

police people sunday: [police, people, sunday, man, wounding, government, region, fire]
---
authorities country warned: [authorities, country, warned, cyclone, security, monday, forces, south]
---
troops afghan killed: [troops, afghan, killed, army, nato, fighters, village, coalition]
---
president free putin: [president, free, putin, march, protest, presidential, victory, attended]
---
tuesday company control: [tuesday, company, control, news, minister, international, newspaper, announced]
---
saudi group arabia: [saudi, group, arabia, rights, anti, amnesty, organisation, law]
---
official city capital: [official, city, capital, aden, mansoura, khalidi, highway, gathered]
---
told election technical: [told, election, technical, libya, july, saturday, conference, day]
---
court case year: [court, case, year, special, ruling, military, collapse, taking]
---
state storm iraq: [state, storm, iraq, coast, america, destructive, calamity, struck]
---
syrian wednesday human: [syrian, wednesda

In [17]:
# Topics are defined as weighted collections of words. Weights can be found as follows:
for t in topics:
    print(t.weights)

(0.43987543886053287, 0.16135492030010024, 0.081614188961369932, 0.069195771551227178, 0.068277589993257087, 0.062424893018241806, 0.061976737518494703, 0.055280459796776205)
(0.32373129752141483, 0.15458006486453921, 0.12155516609939153, 0.11557446612101027, 0.075836923461567854, 0.075097925410003291, 0.06811368921552477, 0.065510467306548234)
(0.23670078458539806, 0.18001495931627137, 0.11557741617425035, 0.11399443600735484, 0.10243402907015549, 0.087713377684911428, 0.086238677994135718, 0.077326319167522611)
(0.2052969677801823, 0.14859840531110766, 0.1291993781820866, 0.10733279924363351, 0.10733279924363351, 0.10694230149406239, 0.10435608227851084, 0.09094126646678323)
(0.33027794462335425, 0.15415144734695396, 0.11262263272299203, 0.093226517829891109, 0.084421468300163074, 0.08305788285091581, 0.07151415310358307, 0.0707279532221467)
(0.24266940530128109, 0.16671756493599579, 0.13360872842932917, 0.12005156472885908, 0.086403191703952553, 0.084608808891324633, 0.0837531350997

**NOTE**: The first set of words represents the strongest portion of the topic, covering approximately 95% of the topic strength. The words in brackets represent the full list of words defining the topic.

## 4. Get Documents Relevant to Topic

In [18]:
topics.mat

ValueError: row, column, and data array must all be the same length

### Get Document Recommendations

In [18]:
# Strongest examples of a single topic: print the title of each recommended document
single_topic_docs = ai.topicmodels.TopicDocumentRecommendation(topics[0], model, n_docs=20)
for row in single_topic_docs:
    if 'title' in row.keys():
        print(row['title'])

# Strongest examples of every topic in the collection, grouped under a
# 1-based "TOPIC n" header
recommendations = ai.topicmodels.TopicDocumentRecommendation(topics, model, n_docs=5)
for topic_number, topicdocs in enumerate(recommendations, start=1):
    print("TOPIC {}".format(topic_number))
    for row in topicdocs:
        if 'title' in row.keys():
            print(row['title'])

SHAPE SCORE: (158, 1)
DOCIDS:[12159, 8734, 6343, 4774, 9265, 11551, 10387, 6568, 5815, 5526, 12568, 8848, 1456, 2893, 8306, 10569, 6834, 3251, 10495, 5420]
Series of deadly attacks hit Iraq
Dozens dead and wounded in Iraq bombings
Dozens dead in wave of Iraq attacks
Attacks leave many dead in Iraq
Spate of deadly attacks across Iraq
Deaths in Iraq bomb explosions
Security forces targeted in Iraq attacks
Multiple Iraq attacks leave many dead
Deaths in attacks on Iraq's Sunni districts
Many deaths in series of Iraq attacks
Deaths reported in Iraq suicide blasts
Deaths in Iraq car bomb attack
Five US troops killed in Iraq attack
Dozen killed in Iraq violence
Blast strikes Shia charity office in Iraq
Al-Qaeda group takes credit for Iraq attacks
Dozens dead in string of Iraq blasts
Trio of violent attacks strike Iraq
Iraq sees deadliest month in over two years
Attacks on Iraq's Shias leave scores dead
SHAPE SCORE: (158, 16)
DOCIDS:[12159, 6343, 4774, 8734, 5815]
DOCIDS:[8826, 12816, 1404, 2

### Note: Defining document display

Different datasets use different schemas. The following code snippet shows you how to determine the names of columns in your dataset so you can decide how to display your content.

In [19]:
# First, list the names of the tables in the underlying database.
# NOTE(review): the original text refers to a table called "Data", but the recorded
# output for this dataset lists no table with that name -- confirm which table is meant.

print(d.db_controller.get_table_names())

['content', 'content_content', 'content_segments', 'content_segdir', 'content_docsize', 'content_stat', 'content_fragindex', 'title', 'title_content', 'title_segments', 'title_segdir', 'title_docsize', 'title_stat', 'title_fragindex', 'attributes']


In [19]:
# Example: Print title and content fields

#for topdoc in recommendations[3]:
#    print("""-- {title} --\n{content}\n\n""".format(**topdoc))

### Export to CSV

In [20]:
import csv
def export_to_csv(fname, topics, topdocs, document_fields=None):
    """
    Export topics and corresponding recommended topic docs to csv file

    One row is written per (topic, document) pair. Each row holds the
    zero-based topic index, the topic name, and the requested document fields.
    A field missing from a document is left empty in that row.

    Args:
        fname: Output path without extension; ".csv" is appended.
        topics: Iterable of topic objects exposing a ``name`` attribute.
        topdocs: Iterable parallel to ``topics``; each item is an iterable of
            documents supporting ``keys()`` and item access by field name.
        document_fields: Names of the document fields to export. Defaults to
            ``['title', 'content']`` when omitted or empty.
    """
    if not document_fields:
        document_fields = ['title', 'content']
    # Materialize once so any iterable (tuple, generator, ...) is accepted
    document_fields = list(document_fields)

    colnames = ['topic_index', 'topic_name'] + document_fields

    # newline='' lets the csv module control line endings itself
    # (otherwise blank rows can appear on some platforms)
    with open(fname+".csv", 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=colnames)
        writer.writeheader()

        for i, (topic, docs) in enumerate(zip(topics, topdocs)):
            for doc in docs:
                # Fresh row per document so values never leak from the previous one
                row = {"topic_index": i, "topic_name": topic.name}
                for field in document_fields:
                    # Check the document, not the output row, for the field
                    if field in doc.keys():
                        row[field] = doc[field]
                writer.writerow(row)

In [21]:
export_to_csv('aljazeera_topics', topics, recommendations)

NameError: name 'recommendations' is not defined

## 5. Subtopics

In [22]:
selected_topic = 1
print(topics[selected_topic])

authorities country warned: [authorities, country, warned, cyclone, security, monday, forces, south]


In [23]:
# TopicSelection defines a topic "mention" based on a threshold of topic strength,
# either 'abs' for a concrete value threshold, or 'quantile' to specify a minimum quantile threshold
# Then, all documents "mentioning" the given topic are selected.

subtopic_sel = ai.topicmodels.TopicSelection(topics[selected_topic], thresh='quantile', thresh_val=20.)
subtopic_sel(model)    # Make the selection concrete by passing in the model

ValueError: row, column, and data array must all be the same length

In [24]:
submodel = model[subtopic_sel]    # Data models can be sliced by a selection

ValueError: Selection object must be concrete to be used to slice a data model.

In [25]:
subtopics = SPCA(submodel, ignore_words=[k for t in topics for k in t.features])   # ignore words from already discovered topics
print(subtopics)

NameError: name 'submodel' is not defined

## 6. Regression

In [26]:
lma = ai.linearmodels.LinearModelRS(rho=0.001)

# What is the difference between Iraq and Iran?
# Positive class: the "iraq" selection minus the "iran" selection; negative class: the reverse.
pos_sel = ai.FeatureSelection("iraq") - ai.FeatureSelection("iran")
neg_sel = ai.FeatureSelection("iran") - ai.FeatureSelection("iraq")
for sel in (pos_sel, neg_sel):
    d.select(sel)
model = d.load(pos_sel + neg_sel)

# Label vector: +1 for the positive selection, -1 for the negative one
classvec = model.get_classification_vector({1.: pos_sel, -1.: neg_sel})
print(np.min(classvec.data))
print(np.max(classvec.data))

# (selection, label) pairs derived from the label -> selection mapping
selection_map = {1: pos_sel, -1: neg_sel}
selection_map = [(s, v) for v, s in selection_map.items()]

# ignore words: iraq, iran
ignored_feature_ids = d.get_feature_ids(['iraq', 'iran'])
ignored_cols = model.feature_id_to_col(ignored_feature_ids)
model._mat = ai.data_conditioning.remove_cols(model.mat, ignored_cols)

# Compute the solution
linear_model = lma(model, classvec)

-1.0
1.0


In [27]:
v = np.argsort(np.array(linear_model.params).ravel())[::-1][:20]   # Get the top 20 

In [28]:
z = d.get_features_by_id(model.col_to_feature_id(v))

In [29]:
for k in z:
    print(k)

tamaulipas
overturned
ukraine
flown
holed
tshisekedi
kabila
ioc
opposed
shia
food
hunger
reportedly
gas
region
july
tymoshenko
famine
players
armstrong


In [30]:
# What are the image words for "iraq" in the news?
# Contrast the documents selected by the "iraq" feature (label +1) with the
# rest of the dataset (label -1), using the whole dataset as the data model.
lma = ai.linearmodels.LinearModelRS(rho=0.0001)
pos_sel = ai.FeatureSelection("iraq")
neg_sel = ai.AllSelection() - ai.FeatureSelection("iraq")
d.select(pos_sel)
d.select(neg_sel)
model = d.load(ai.AllSelection())
classvec = model.get_classification_vector({1.: pos_sel, -1.: neg_sel})
# NOTE(review): selection_map is not read again in the visible part of this
# notebook -- confirm it is still needed.
selection_map = {1: pos_sel, -1: neg_sel}
selection_map = [(s, v) for v, s in iter(selection_map.items())]
# ignore words: iraq
model._mat = ai.data_conditioning.remove_cols(model.mat, model.feature_id_to_col(d.get_feature_ids(['iraq'])))
# Compute the solution
linear_model = lma(model, classvec)
# Indices of the 20 largest coefficients, in descending order
v = np.argsort(np.array(linear_model.params).ravel())[::-1][:20]   # Get the top 20 
z = d.get_features_by_id(model.col_to_feature_id(v))
# NOTE(review): the recorded run of this cell ends with "IndexError: Index out of range",
# so `z` was presumably not reassigned and later cells may show values left over
# from the previous regression -- confirm and re-run.


Error accessing d=13288


IndexError: Index out of range

In [31]:
for k, w in zip(z, np.array(linear_model.params).ravel()[v]):
    print("{k}: {w:.2}".format(k=k, w=w))

tamaulipas: 0.031
overturned: 0.025
ukraine: 0.021
flown: 0.018
holed: 0.018
tshisekedi: 0.017
kabila: 0.017
ioc: 0.017
opposed: 0.017
shia: 0.016
food: 0.016
hunger: 0.016
reportedly: 0.015
gas: 0.015
region: 0.014
july: 0.014
tymoshenko: 0.013
famine: 0.013
players: 0.013
armstrong: 0.013


                            ##########################################################

# Anseri

Main Anseri functions:

## 1. Import Dataset

In [32]:
# Suppress progress notifications for a cleaner notebook
ai.disable_progress()  

In [33]:
#import a sql dataset
d = ai.Dataset("sql_dataset")

OSError: Config file for dataset sql_dataset not found.

### 2. Select ALL

In [34]:
#Select all the dataset
selection = ai.AllSelection()

In [35]:
# Get the unix timestamps of the min/max times of entries in database
print(d.time_range)

(1300233600, 1361836800)


In [36]:
# Get human-readable representation of timestamps (mth = machine-to-human)
print([ai.utc.mth(x) for x in d.time_range])

['Mar 16 2011', 'Feb 26 2013']


### 3. Select by time window

In [37]:
# Time windows specified human-readable
selection = ai.TimeSelection(('Mar 1 2012', 'Jan 2 2013'))   

In [38]:
# Time windows specified machine-readable
selection = ai.TimeSelection((1300233600, 1350000000)) 

### 4. Attribute Selection

In [39]:
# Select documents by attribute: here, documents whose "author" attribute
# matches one of the listed values ("keyword" is a placeholder value).
selection = ai.AttributeSelection("author", ["keyword"])

### 5. Full-text Search

In [40]:
# full text search with fuzzy matching
selection = ai.FullTextSelection("topic")    

### 6. Select by keyword (feature) mention

In [41]:
#Select by keyword (feature) mention
selection = ai.FeatureSelection("feature")

## 7. Load data matching selection

In [42]:
# Load the sparse matrix model of corpus
model = d.load(selection)   

In [43]:
# selection.docids contains references to the documents matching the selection;
# its length is the number of matching documents.
len(selection.docids)  

8

## 8. Load Raw Content from Database

In [44]:
# Load raw content from the database: fetch the first three matching documents
# and show each title, falling back to the content when there is no title field.
sample_ids = list(selection.docids)[:3]
for doc in d.get_documents_by_id(sample_ids, fields=['title', 'content']):
    print(doc.keys())
    if 'title' not in doc.keys():
        print("Content: \t{}".format(doc['content']))
    else:
        print("Title: \t\t{}".format(doc['title']))

    print("\n\n")

DOCIDS:[3746, 3654, 12454]
dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		Pires wins African governance prize



dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		Israeli scientist wins Nobel chemistry prize



dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])
Title: 		Facebook launches new social search tool





In [45]:
# Get the names of available fields in content
print(doc.keys())   

dict_keys(['time', 'source', 'url', 'meta', 'location', 'title', 'docid', 'content'])


In [46]:
# Get details about shape of sparse matrix representation of content
n, m = model.shape    
print("n documents: {:,}".format(n))
print("m features: {:,}".format(m))

n documents: 8
m features: 230


## 9. Get Topics

In [47]:
# Instantiate the algorithm.
# ignore_terms injects extra stop-words at runtime.
EXTRA_STOP_WORDS = ["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul",
                    "aug", "sep", "oct", "nov", "dec"]

# NOTE(review): n // 10 evaluates to 0 when fewer than 10 documents are loaded
# (e.g. n = 8) -- confirm that card_docs=0 is acceptable to the model.
SPCA = ai.topicmodels.TopicModelSPCA(
    n_topics=16,
    card_terms=8,
    card_docs=(n // 10),    # a good rule of thumb is to use 1 to 10% of documents
    ignore_terms=EXTRA_STOP_WORDS,
)

In [48]:
# Compute topics within the data model.
# The resulting TopicModel object has a convenient printable representation
# (try print(topics)).
topics = SPCA(model, ignore_words=["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul", 
                                                   "aug", "sep", "oct", "nov", "dec"])  

## 10. Get Documents Relevant to Topic

In [49]:
# Topics are defined as weighted collections of words. Weights can be found as follows:
for t in topics:
    print(t.weights)

(0.23193282488403202, 0.19871216374480125, 0.14903412280860096, 0.1241951023405008, 0.099356081872400623, 0.074517061404300478, 0.064192961338033039, 0.058059681607330896)
(0.19269414669350601, 0.19269414669350601, 0.10852349552424298, 0.10852349552424298, 0.10852349552424298, 0.096347073346753007, 0.096347073346753007, 0.096347073346753007)
(0.20276627477822559, 0.19585058783266163, 0.10484098086189478, 0.10484098086189478, 0.097925293916330813, 0.097925293916330813, 0.097925293916330813, 0.097925293916330813)
(0.21972490437572595, 0.11544477197350066, 0.11311772155549772, 0.11226279334382383, 0.10986245218786297, 0.10986245218786297, 0.10986245218786297, 0.10986245218786297)
(0.28410912142187672, 0.14205456071093836, 0.14205456071093836, 0.14205456071093836, 0.076645355378900659, 0.07102728035546918, 0.07102728035546918, 0.07102728035546918)
(0.20000000000000001, 0.20000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.100000000000000

#### NOTE: The first set of words represents the strongest portion of the topic, covering approximately 95% of the topic strength. The words in brackets represent the full list of words defining the topic.

In [50]:
# Get strongest examples of a single topic
for row in ai.topicmodels.TopicDocumentRecommendation(topics[0], model, n_docs=20):
    if 'title' in row.keys():
        print(row['title'])

ValueError: row, column, and data array must all be the same length

In [51]:
# Strongest examples of every topic in the collection, grouped under a
# 1-based "TOPIC n" header
recommendations = ai.topicmodels.TopicDocumentRecommendation(topics, model, n_docs=5)
for topic_number, topicdocs in enumerate(recommendations, start=1):
    print("TOPIC {}".format(topic_number))
    for row in topicdocs:
        if 'title' in row.keys():
            print(row['title'])

ValueError: row, column, and data array must all be the same length

#### Note: Defining document display
#### Different datasets use different schemas. The following code snippet shows you how to determine the names of columns in your dataset so you can decide how to display your content.

## 11. Subtopics

In [52]:
selected_topic = 1
print(topics[selected_topic])

death singer involuntary: [death, singer, involuntary, murray, star, trial, ruled, guilty]


In [53]:
# TopicSelection defines a topic "mention" based on a threshold of topic strength,
# either 'abs' for a concrete value threshold, or 'quantile' to specify a minimum quantile threshold
# Then, all documents "mentioning" the given topic are selected.

subtopic_sel = ai.topicmodels.TopicSelection(topics[selected_topic], thresh='quantile', thresh_val=20.)

# Make the selection concrete by passing in the model
subtopic_sel(model)    

ValueError: row, column, and data array must all be the same length

In [54]:
# Data models can be sliced by a selection
submodel = model[subtopic_sel]   

ValueError: Selection object must be concrete to be used to slice a data model.

In [55]:
# ignore words from already discovered topics
subtopics = SPCA(submodel, ignore_words=[k for t in topics for k in t.features])  
print(subtopics)

NameError: name 'submodel' is not defined