# Anseri Topic Analysis

## 0. Open a dataset

 ### Dataset Format:

In [None]:
#The recommended approach is to let anseri create and format the SQLite database.
#The developer just needs to be able to convert their data source into an iterable collection of dicts, 
#where each dict has the same keys, and the keys will correspond to columns in a database table. 

#FOR NOW call:

def from_iterable(cls, iterable, dbname, dbpath, contentcols, timecol):
   """Create new database

   Only allows for single-valued attributes for now.

   Args:
       timecol: Name of column containing timestamps, or None.
       dbname: Name of dataset
       dbpath: Path to the database, where it should be created
       iterable: Iterable data source, should be a sequence of dicts.
       contentcols: Names of columns containing text content to be analyzed.

   Returns:
       :py:class:`SQLiteController_FTS`: Instantiated raw content database backend
   """

#Alternatively, you can convert a pandas dataframe into a database:

def from_dataframe(cls, df, dbname, dbpath, contentcols, timecol, dateformat=None, splitter=None):
   """
   Import a pandas dataframe into an sqlite database table.

   Args:
       df (pandas.DataFrame): dataframe
       dbname (str): name of dataset
       dbpath (str): path to database, where it should be created
       contentcols (str | list[str]): names of columns containing content to be analyzed
       timecol (str | None): name of column containing timestamps, or None. If None, will detect
           column named 'time' or 'date'.


       dateformat (str): format used to parse date to convert to unix timestamps
       splitter: `anseri` Content splitter, optionally split columns by sentence.


   Returns:
       :py:class:`SQLiteController_FTS`:  Instantiated raw content database backend
   """
#Splitters allow content to be broken up by sentence or fixed word-length windows. 
#These are classes defined in DatabaseController.pyx, and documentation should show up in the API docs. 
#Splitters accept iterables and are themselves iterable, 
#so they can be used with the from_iterable function as well.

#Database format:

#The sqlite db should contain a table named data, with at least three columns:
#content (string)
#title (string)
#time (int)


## 1. Import Dataset

In [3]:
# Suppress progress notifications for a cleaner notebook
import anseri as ai
ai.disable_progress()  

In [5]:
#import a sql dataset
d = ai.Dataset("aljazeera")

2016-12-07 20:15:14,301 [MainThread  ] [INFO ]  CONFIG FILE: /home/gabriel/.ai/config/aljazeera.cfg


### 2. Select ALL

In [6]:
#Select all the dataset
selection = ai.AllSelection()

In [7]:
# Get the unix timestamps of the min/max times of entries in database
print(d.time_range)

(1300233600, 1361836800)


In [8]:
# Get human-readable representation of timestamps (mth = machine-to-human)
print([ai.utc.mth(x) for x in d.time_range])

['Mar 16 2011', 'Feb 26 2013']


### 3. Select by time window

In [9]:
# Time windows specified human-readable
selection = ai.TimeSelection(('Mar 1 2012', 'Jan 2 2013'))   

In [10]:
# Time windows specified machine-readable
selection = ai.TimeSelection((1300233600, 1350000000)) 

### 4. Attribute Selection

In [11]:
#Select a document in a column
selection = ai.AttributeSelection("author", ["keyword"])

### 5. Full-text Search

In [12]:
# full text search with fuzzy matching
selection = ai.FullTextSelection("topic")    

### 6. Select by keyword (feature) mention

In [13]:
#Select by keyword (feature) mention
selection = ai.FeatureSelection("feature")

## 7. Load data matching selection

In [14]:
# Load the sparse matrix model of corpus
model = d.load(selection)   

In [15]:
# selection.docids contain references to documents matching selection
#length of documents
len(selection.docids)  

8

## 8. Load Raw Content from Database

In [16]:
#Load Raw Content from Database
for doc in d.get_documents_by_id(list(selection.docids)[:3], fields=['title', 'content']):
    print(doc.keys())
    if 'title' in doc.keys():
        print("Title: \t\t{}".format(doc['title']))
    else:
        print("Content: \t{}".format(doc['content']))
        
    print("\n\n")

DOCIDS:[3746, 3654, 12454]
dict_keys(['source', 'docid', 'meta', 'url', 'title', 'time', 'location', 'content'])
Title: 		Pires wins African governance prize



dict_keys(['source', 'docid', 'meta', 'url', 'title', 'time', 'location', 'content'])
Title: 		Israeli scientist wins Nobel chemistry prize



dict_keys(['source', 'docid', 'meta', 'url', 'title', 'time', 'location', 'content'])
Title: 		Facebook launches new social search tool





In [17]:
# Get the names of available fields in content
print(doc.keys())   

dict_keys(['source', 'docid', 'meta', 'url', 'title', 'time', 'location', 'content'])


In [18]:
# Get details about shape of sparse matrix representation of content
n, m = model.shape    
print("n documents: {:,}".format(n))
print("m features: {:,}".format(m))

n documents: 8
m features: 230


## 9. Get Topics

In [19]:
# Instantiate the algorithm
# ignore_terms injects extra stop-words at runtime
SPCA = ai.topicmodels.TopicModelSPCA(n_topics=16, 
                                     card_terms=8, 
                                     card_docs=(n // 10),    # a good rule of thumb is to use 1 to 10% of documents
                                     ignore_terms=["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul", 
                                                   "aug", "sep", "oct", "nov", "dec"])   

In [20]:
# TopicModel object has a convenient representation
# Compute topics within the data model
topics = SPCA(model, ignore_words=["gen", "jan", "feb", "mar", "apr", "may", "jun", "jul", 
                                                   "aug", "sep", "oct", "nov", "dec"])  

## 10. Get Documents Relevant to Topic

In [21]:
# Topics are defined as weighted collections of words. Weights can be found as follows:
for t in topics:
    print(t.weights)

(0.23193282488403202, 0.19871216374480125, 0.14903412280860096, 0.1241951023405008, 0.099356081872400623, 0.074517061404300478, 0.064192961338033039, 0.058059681607330896)
(0.19269414669350601, 0.19269414669350601, 0.10852349552424298, 0.10852349552424298, 0.10852349552424298, 0.096347073346753007, 0.096347073346753007, 0.096347073346753007)
(0.20276627477822559, 0.19585058783266163, 0.10484098086189478, 0.10484098086189478, 0.097925293916330813, 0.097925293916330813, 0.097925293916330813, 0.097925293916330813)
(0.21972490437572595, 0.11544477197350066, 0.11311772155549772, 0.11226279334382383, 0.10986245218786297, 0.10986245218786297, 0.10986245218786297, 0.10986245218786297)
(0.28410912142187672, 0.14205456071093836, 0.14205456071093836, 0.14205456071093836, 0.076645355378900659, 0.07102728035546918, 0.07102728035546918, 0.07102728035546918)
(0.20000000000000001, 0.20000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.10000000000000001, 0.100000000000000

#### NOTE: The first set of words represent the strongest portion of the topic, covering approximately 95% of the topic strength. The words in brackets represent the total list of words defining the topic.

In [29]:
# Get strongest examples of a single topic
for row in ai.topicmodels.TopicDocumentRecommendation(topics[0], model, n_docs=20):
    if 'title' in row.keys():
        print(row['title'])

ValueError: row, column, and data array must all be the same length

In [30]:
# Get strongest examples of each topic in a collection of topics
recommendations = ai.topicmodels.TopicDocumentRecommendation(topics, model, n_docs=5)
for i, topicdocs in enumerate(recommendations):
    print("TOPIC {}".format(i+1))
    for row in topicdocs:
        if 'title' in row.keys():
            print(row['title'])

ValueError: row, column, and data array must all be the same length

#### Note: Defining document display
#### Different datasets use different schemas. The following code snippet shows you how to determine the names of columns in your dataset so you can decide how to display your content.

## 11. Subtopics

In [31]:
selected_topic = 1
print(topics[selected_topic])

death singer involuntary: [death, singer, involuntary, murray, star, trial, ruled, guilty]


In [32]:
# TopicSelection defines a topic "mention" based on a threshold of topic strength,
# either 'abs' for a concrete value threshold, or 'quantile' to specify a minimum quantile threshold
# Then, all documents "mentioning" the given topic are selected.

subtopic_sel = ai.topicmodels.TopicSelection(topics[selected_topic], thresh='quantile', thresh_val=20.)

# Make the selection concrete by passing in the model
subtopic_sel(model)    

ValueError: row, column, and data array must all be the same length

In [None]:
# Data models can be sliced by a selection
submodel = model[subtopic_sel]   

In [None]:
# ignore words from already discovered topics
subtopics = SPCA(submodel, ignore_words=[k for t in topics for k in t.features])  
print(subtopics)