execute_notebook : A method to load and execute another notebook in this notebook's namespace

cite: http://nbviewer.jupyter.org/gist/minrk/5491090/analysis.ipynb

Call it with the path of the notebook whose definitions you want to import here.

In [94]:
import io
from nbformat import current

def execute_notebook(nbfile):
    """Run every code cell of another notebook inside this notebook's kernel.

    The cells are executed through the active IPython shell, so every name
    they define (imports, functions, variables) ends up in the namespace of
    the calling notebook.

    Arguments:
        nbfile: path of the ``.ipynb`` file to execute.
    """
    # Local import: the file-level import only exposes the deprecated
    # `nbformat.current` compatibility module.
    import nbformat

    # Notebook files are UTF-8 encoded JSON; do not depend on the locale default.
    with io.open(nbfile, encoding='utf-8') as f:
        # as_version=4 converts older on-disk formats to the v4 structure
        # (flat `nb.cells`, code stored in `cell.source`).
        nb = nbformat.read(f, as_version=4)

    ip = get_ipython()

    for cell in nb.cells:
        if cell.cell_type == 'code':
            ip.run_cell(cell.source)
# Execute the code cells of the companion notebook in this kernel's namespace.
# NOTE(review): presumably this is where `pd`, `np`, `time`, `mle` and `pdmf`
# used by the cells below come from -- confirm against that notebook.
execute_notebook("distributions_fit_and_likelihood.ipynb")

Necessary packages and libraries to connect to Database

In [45]:
# Name of the database the engine connects to.
db = 'twitterGender'
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
# Connection URL for the MySQL driver. The only connection option given is the
# path of an option file; the path is absolute and user-specific, so it must be
# adjusted when the notebook runs under another account.
# NOTE(review): presumably host/user/password are read from that file -- confirm.
myDB = URL(drivername='mysql', database=db, query={ 'read_default_file' : '/home/fatal/.my.cnf' })
# Shared engine passed as `con=` to every pd.read_sql call below.
engine = create_engine(name_or_url=myDB, encoding='utf8')
#conn = engine.connect()

# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether another notebook relies on it.
small_val = 0.01

Global Initializations

In [120]:
# Distribution names; each one is passed as the `dist` argument of mle() when
# training and of pdmf() when testing.
DISTRIBUTIONS = [
    'zin_norm', 'zin_lognorm', 'zin_powerlaw'
]
# Number of folds, read as a global by generate_kfolds().
kfold = 5
# Number of distinct labels, read as a global by generate_kfolds(); label
# values are matched against 0 .. labels-1.
labels = 2

# Generate k random folds of train and test

In [176]:
def generate_kfolds(label_table_name, random_state=None):
    """Split the labelled individuals of a table into k train/test folds.

    All rows of `label_table_name` are shuffled once; fold k uses the k-th
    contiguous slice of the shuffled rows as its test set and every other row
    as its training set. The number of folds and of labels are read from the
    module-level `kfold` and `labels`; label values are expected to be
    0 .. labels-1.

    Arguments:
        label_table_name: name of a table with `group_id` and `label` columns.
        random_state: optional seed forwarded to `DataFrame.sample` so that the
            same folds can be regenerated. The default (None) keeps the
            original behaviour of a different shuffle on every call.

    Returns:
        (label_prior_prob, train_fold, test_fold), each a kfold by labels
        nested list:
        * label_prior_prob[k][l]: share of fold k's training rows with label l
        * train_fold[k][l]: group_ids with label l in fold k's training set
        * test_fold[k][l]: group_ids with label l in fold k's test set
    """
    # Load DATA from DB
    id_labels = pd.read_sql('select group_id, label from ' + label_table_name, con=engine)
    group_id_cnt = len(id_labels)
    print ("Total number of individuals: " + str(group_id_cnt))

    # Shuffle the rows (random.shuffle() only works for lists).
    id_labels = id_labels.sample(frac=1, random_state=random_state)

    train_fold = [[[] for _ in range(labels)] for _ in range(kfold)]
    test_fold = [[[] for _ in range(labels)] for _ in range(kfold)]
    label_prior_prob = [[0 for _ in range(labels)] for _ in range(kfold)]

    for k in range(kfold):
        # Integer arithmetic gives the same boundaries as int(n*k/kfold).
        test_start = group_id_cnt * k // kfold
        test_end = group_id_cnt * (k + 1) // kfold
        # Separate train and test; always positional because the index is
        # shuffled after sample().
        test_kth_fold = id_labels.iloc[test_start:test_end]
        train_kth_fold = pd.concat([id_labels.iloc[:test_start], id_labels.iloc[test_end:]])
        total = len(train_kth_fold)
        for l in range(labels):  # For each fold, separate data with different labels
            train_fold[k][l] = train_kth_fold[train_kth_fold.label == l].group_id.tolist()
            # float() forces true division; under Python 2 integer division the
            # prior would silently become 0 and its log -inf.
            label_prior_prob[k][l] = len(train_fold[k][l]) / float(total)
            test_fold[k][l] = test_kth_fold[test_kth_fold.label == l].group_id.tolist()
    return (label_prior_prob, train_fold, test_fold)

 # Build Naive Bayes classifier for each fold

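 ### Arguments
 * table_name: name of the feature table
 * train_fold: a k by l array of group_id lists; the training set of each of the k folds, divided by label

 The number of folds (kfold) and the number of labels (labels) are not passed in; they are derived from the shape of train_fold.
 
### Returns
params: a k by l array of dictionaries, indexed as params[fold][label][feature][distribution]. It holds the parameters of each distribution, fitted for each feature on each training set in each category of labels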

In [127]:
def buildNaiveBayes(table_name, train_fold):
    """Fit every distribution to every feature, per fold and per label.

    For each distinct feature of `table_name` the rows of that feature are
    loaded with one query. For each (fold, label) the `group_norm` values of
    the training individuals are collected, padded with one 0 for every
    training individual that has no row for the feature, and handed to
    `mle(values, dist)` for each name in `DISTRIBUTIONS`.

    Arguments:
        table_name: feature table with `group_id`, `feat` and `group_norm`
            columns.
        train_fold: k by l nested list of group_id lists (fold, then label).

    Returns:
        params: k by l nested list of dicts, params[k][l][feat][dist] being
        whatever `mle` returned. NOTE: `feat` is the SQL-escaped feature name
        (`'` doubled, `%` doubled, `\\` doubled), not the raw one; lookups must
        apply the same escaping.

    Any exception raised by `mle` propagates and aborts the whole run.
    """
    import sys  # only needed for the in-place progress line

    kfold = len(train_fold)
    labels = len(train_fold[0]) if train_fold else 0
    params = [[{} for l in range(labels)] for k in range(kfold)]
    query_time = 0
    fit_time = 0

    stime = time.time()
    # Load list of distinct features
    features = pd.read_sql('select feat, count(*) cnt from ' + table_name + ' group by feat', con=engine)
    feat_cnt = len(features)
    print(str(feat_cnt) + ' unique features')

    # Load data, feature by feature, and set distribution parameters for each quadruple
    # (fold, gender, feature, dist)
    for cntr, raw_feat in enumerate(features.feat, 1):
        qs = time.time()
        # Escape the feature for embedding in the SQL literal; the doubled '%'
        # keeps it from being treated as a format placeholder by the driver.
        feat = raw_feat.replace("'", "''").replace("%", "%%").replace('\\', '\\\\')
        fetch_data_query = "select * from " + table_name + " where feat = '" + feat + "'"
        raw_df = pd.read_sql(fetch_data_query, con=engine)
        qe = time.time()

        for k in range(kfold):
            for l in range(labels):
                params[k][l][feat] = {}
                train = raw_df[raw_df.group_id.isin(train_fold[k][l])].group_norm.tolist()
                # Individuals without a row for this feature count as zeros.
                train += [0] * (len(train_fold[k][l]) - len(train))
                train_values = np.asarray(train)
                for dist in DISTRIBUTIONS:
                    params[k][l][feat][dist] = mle(train_values, dist)  # calc MLE

        beste = time.time()
        query_time += (qe - qs)
        fit_time += (beste - qe)
        # '\r' rewrites the same output line; written through sys.stdout so the
        # code is valid on both Python 2 and Python 3.
        sys.stdout.write('\r' + str(cntr) + ' out of ' + str(feat_cnt) + ', ' + feat)
        sys.stdout.flush()
    etime = time.time()
    sys.stdout.write('\n')
    print('overall time: ' + str(round(etime-stime)) + 's -> I/O time: ' + str(round(query_time)) +
          's, run time: ' + str(round(fit_time)) + 's')

    return params

# Test Naive Bayes for all Distributions

In [218]:
def test_naiveBayes(table_name, test_fold, label_prior_prob, params):
    kfold = len(test_fold)
    labels = len(test_fold[0])
    tp = {}
    total = 0
    label_tp = [0] * labels
    label_total = [0] * labels
    for l in range(labels):
        label_tp[l] = {}
        for dist in DISTRIBUTIONS:
            tp[dist] = 0
            label_tp[l][dist] = 0
    group_id_cnt = 0
    for k in range(kfold):
        for l in range(labels):
            group_id_cnt += len(test_fold[k][l])
    print ("Total number of test cases: " + str(group_id_cnt))
    cntr = 1
    query_total_time = 0
    run_time = 0
    st = time.time()
    for k in range(kfold):
        for test_l in range(labels): 
            qs = time.time()
            fetch_data_query = "select * from " + table_name + \
                " where group_id in ("+str(test_fold[k][test_l])[1:-1]+")"
            test_df = pd.read_sql(fetch_data_query, con=engine)
            qe = time.time()
            query_total_time += (qe - qs)
            for dist in DISTRIBUTIONS:
                for id, grp in test_df.groupby("group_id"):
                    feats = grp.feat.replace("'", "''").replace("%", "%%").replace('\\', '\\\\')
                    max_prob = float("-inf")
                    for l in range(labels):
                        label_prob = np.log(label_prior_prob[k][l])
                        for feat in feats:
                            data = grp[grp.feat == feat].group_norm.tolist()
                            label_prob += np.log(pdmf(np.asarray(data) , dist, params[k][l][feat][dist]))
                        if label_prob > max_prob:
                            best_label = l
                            max_prob = label_prob
                    # Track total number of test cases
                    total += 1
                    label_total[test_l] += 1
                    # Track correct classifications as tp:True Positive
                    if best_label == test_l:
                        tp[dist] += 1
                        label_tp[test_l][dist] += 1
                    print '\r' + str(cntr) + ' out of ' + str(group_id_cnt*len(tp)),
                    cntr += 1
            run_time += (time.time() - qe)
    total /= len(DISTRIBUTIONS)
    label_total = [lt/len(DISTRIBUTIONS) for lt in label_total]
    et = time.time()
    total_time = et-st
    print 
    print ("overall time: " + str(round(total_time)) + "s -> I/O time: " + str(round(query_total_time)) + \
           "s, run time: " + str(round(run_time)) + "s")
    return (total, label_total, tp, label_tp)

# Table Names

It's important that table_name and label_table_name are kept in sync with each other.
That's why the definitions of these tables are put together in one chunk.

Table_name is the name of the feature table we want to analyze, which can be at the cnty, user, or message level. Depending on the level of the feature table, label_table_name should be the name of the table containing the list of group_ids and labels at the same level.

In [165]:
# County level (prefix c): label table, then the LIWC, topic and 1-gram feature tables.
c_label_table_name = "msgs_cntiesw10u10m_cnties_gender"
cliwc_table_name = "feat$cat_LIWC2007$msgs_cntiesw10u10m$cnty$16to16"
ctopic_table_name = "feat$cat_met_a30_2000_cp_w$msgs_cntiesw10u10m$cnty$16to16"
c1gram_table_name = "feat$1gram$msgs_cntiesw10u10m$cnty$16to16"

# User level (prefix u): same four tables.
u_label_table_name = "msgs_cntiesw10u10m_usrs_gender"
uliwc_table_name = "feat$cat_LIWC2007$msgs_cntiesw10u10m$user_id$16to16"
utopic_table_name = "feat$cat_met_a30_2000_cp_w$msgs_cntiesw10u10m$user_id$16to16"
u1gram_table_name = "feat$1gram$msgs_cntiesw10u10m$user_id$16to16"

# Message level (prefix m): same four tables, built on a different message set
# (msgs_w10u10m_mr20k) than the county and user levels.
m_label_table_name = "msgs_w10u10m_mr20k_msgs_gender"
mliwc_table_name = "feat$cat_LIWC2007$msgs_w10u10m_mr20k$message_id$16to16"
mtopic_table_name = "feat$cat_met_a30_2000_cp_w$msgs_w10u10m_mr20k$message_id$16to16"
m1gram_table_name = "feat$1gram$msgs_w10u10m_mr20k$message_id$16to16"

# Define all kfolds train and test in each level

In [148]:
# One set of folds per level; the same folds are reused for every feature
# table of that level in the training and test cells below.
# County level
c_label_prior_prob, c_train_fold, c_test_fold = generate_kfolds(c_label_table_name)

# User level
u_label_prior_prob, u_train_fold, u_test_fold = generate_kfolds(u_label_table_name)

# Message level
m_label_prior_prob, m_train_fold, m_test_fold = generate_kfolds(m_label_table_name)

Total number of individuals: 872
Total number of individuals: 9234
Total number of individuals: 20000


# Train

In [171]:
# LIWC
cliwc_params = buildNaiveBayes(cliwc_table_name, c_train_fold)
uliwc_params = buildNaiveBayes(uliwc_table_name, u_train_fold)
mliwc_params = buildNaiveBayes(mliwc_table_name, m_train_fold)

# TOPICS
ctopic_params = buildNaiveBayes(ctopic_table_name, c_train_fold)
utopic_params = buildNaiveBayes(utopic_table_name, u_train_fold)
mtopic_params = buildNaiveBayes(mtopic_table_name, m_train_fold)

# 1GRAMS
c1gram_params = buildNaiveBayes(c1gram_table_name, c_train_fold)
u1gram_params = buildNaiveBayes(u1gram_table_name, u_train_fold)
m1gram_params = buildNaiveBayes(m1gram_table_name, m_train_fold)


64 unique features
64 out of 64, YOU
overall time: 2.0s -> I/O time: 1.0s, run time: 1.0s
64 unique features
64 out of 64, YOU
overall time: 7.0s -> I/O time: 4.0s, run time: 3.0s
64 unique features
64 out of 64, YOU
overall time: 7.0s -> I/O time: 2.0s, run time: 4.0s
2000 unique features
2000 out of 2000, 999
overall time: 60.0s -> I/O time: 28.0s, run time: 28.0s
2000 unique features
2000 out of 2000, 999
overall time: 432.0s -> I/O time: 263.0s, run time: 95.0s
2000 unique features
2000 out of 2000, 999
overall time: 175.0s -> I/O time: 49.0s, run time: 118.0s
1793072 unique features
1 out of 1793072, !



ValueError: min() arg is an empty sequence

# LIWC , COUNTY

In [219]:
# County-level LIWC evaluation; unpacks (total, label_total, tp, label_tp).
cliwc_tot, cliwc_ltot, cliwc_tp, cliwc_ltp = \
test_naiveBayes(cliwc_table_name, c_test_fold, c_label_prior_prob, cliwc_params)

Total number of test cases: 872
2616 out of 2616
overall time: 237.0s -> I/O time: 0.0s, run time: 237.0s


# LIWC, USER

In [222]:
# User-level LIWC evaluation; unpacks (total, label_total, tp, label_tp).
uliwc_tot, uliwc_ltot, uliwc_tp, uliwc_ltp = \
test_naiveBayes(uliwc_table_name, u_test_fold, u_label_prior_prob, uliwc_params)

Total number of test cases: 9234
27699 out of 27702
overall time: 2372.0s -> I/O time: 4.0s, run time: 2368.0s


# LIWC, MESSAGE

In [224]:
# Message-level LIWC evaluation; unpacks (total, label_total, tp, label_tp).
mliwc_tot, mliwc_ltot, mliwc_tp, mliwc_ltp = \
test_naiveBayes(mliwc_table_name, m_test_fold, m_label_prior_prob, mliwc_params)

 Total number of test cases: 20000
57609 out of 60000
overall time: 1359.0s -> I/O time: 3.0s, run time: 1356.0s


# LIWC, Compare Results

In [225]:
# Correct classifications per distribution, followed by the number of
# classified test cases, for the county, user and message level.
# Each line is built as a single string so the cell prints the same text on
# Python 2 and Python 3.
print(str(cliwc_tp) + " out of " + str(cliwc_tot))
print(str(uliwc_tp) + " " + str(uliwc_tot))
print(str(mliwc_tp) + " " + str(mliwc_tot))
print('\n\n')
# The same counts broken down by true label (list index = label).
print(str(cliwc_ltp) + " " + str(cliwc_ltot))
print(str(uliwc_ltp) + " " + str(uliwc_ltot))
print(str(mliwc_ltp) + " " + str(mliwc_ltot))

{'zin_lognorm': 469, 'zin_norm': 458, 'zin_powerlaw': 406} out of 872.0
{'zin_lognorm': 5081, 'zin_norm': 5645, 'zin_powerlaw': 4290} 9233.0
{'zin_lognorm': 10220, 'zin_norm': 10105, 'zin_powerlaw': 10162} 19203.0



[{'zin_lognorm': 325, 'zin_norm': 301, 'zin_powerlaw': 78}, {'zin_lognorm': 144, 'zin_norm': 157, 'zin_powerlaw': 328}] [436.0, 436.0]
[{'zin_lognorm': 1860, 'zin_norm': 2887, 'zin_powerlaw': 8}, {'zin_lognorm': 3221, 'zin_norm': 2758, 'zin_powerlaw': 4282}] [4943.0, 4290.0]
[{'zin_lognorm': 5962, 'zin_norm': 5996, 'zin_powerlaw': 8198}, {'zin_lognorm': 4258, 'zin_norm': 4109, 'zin_powerlaw': 1964}] [10589.0, 8614.0]


# TOPIC, COUNTY

In [None]:
# County-level topic evaluation. test_naiveBayes returns four values
# (total, label_total, tp, label_tp); unpacking only two raises ValueError.
ctopic_tot, ctopic_ltot, ctopic_tp, ctopic_ltp = \
    test_naiveBayes(ctopic_table_name, c_test_fold, c_label_prior_prob, ctopic_params)

Total number of test cases: 872
1165 out of 2616 

# TOPIC, USER

In [None]:
# User-level topic evaluation. test_naiveBayes returns four values
# (total, label_total, tp, label_tp); unpacking only two raises ValueError.
utopic_tot, utopic_ltot, utopic_tp, utopic_ltp = \
    test_naiveBayes(utopic_table_name, u_test_fold, u_label_prior_prob, utopic_params)

# TOPIC, MESSAGE

In [None]:
# Message-level topic evaluation. The results are stored under the m-prefixed
# names so they do not overwrite the user-level (utopic_*) results, and all
# four returned values (total, label_total, tp, label_tp) are unpacked.
mtopic_tot, mtopic_ltot, mtopic_tp, mtopic_ltp = \
    test_naiveBayes(mtopic_table_name, m_test_fold, m_label_prior_prob, mtopic_params)