execute_notebook : A method to load and execute another notebook in this notebook's namespace

cite: http://nbviewer.jupyter.org/gist/minrk/5491090/analysis.ipynb

Call it with the notebook whose definitions you want to import here.

In [145]:
import io
from nbformat import current

def execute_notebook(nbfile):
    """Run the code cells of another notebook inside this notebook's namespace.

    The file is parsed with ``current.read(f, 'json')`` and the ``input`` of
    every code cell in the first worksheet is handed to
    ``get_ipython().run_cell``, so whatever those cells define becomes
    available here.

    Arguments
      nbfile: path of the notebook file to execute.

    Returns
      None. A notebook without any worksheet is a no-op.
    """
    # Notebook files are UTF-8 encoded JSON; without an explicit encoding
    # io.open falls back to the locale default and can fail (or mis-decode)
    # on non-ASCII cell content.
    with io.open(nbfile, encoding='utf-8') as f:
        nb = current.read(f, 'json')

    worksheets = nb.worksheets
    if not worksheets:
        # Nothing to run; avoids an IndexError on worksheets[0].
        return

    ip = get_ipython()

    for cell in worksheets[0].cells:
        # Markdown / raw cells carry no executable input.
        if cell.cell_type != 'code':
            continue
        ip.run_cell(cell.input)
# Execute the helper notebook so that its definitions land in this namespace.
# NOTE(review): the cells below use pd, np, time, mle, pdmf, init_default and
# update_default without importing or defining them in this notebook --
# presumably they all come from this helper notebook; confirm.
execute_notebook("distributions_fit_and_likelihood.ipynb")

Necessary packages and libraries to connect to Database

In [37]:
import os

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

# Name of the MySQL database that holds the feature and label tables.
db = 'twitterGender'

# Connection settings are read from a MySQL option file rather than being
# written into the notebook. The path can be overridden with the MYSQL_CNF
# environment variable so the notebook is not tied to one user's home
# directory; without the variable the original path is used.
my_cnf = os.environ.get('MYSQL_CNF', '/home/fatal/.my.cnf')

myDB = URL(drivername='mysql', database=db, query={ 'read_default_file' : my_cnf })
engine = create_engine(name_or_url=myDB, encoding='utf8')
#conn = engine.connect()

# NOTE(review): small_val is not referenced by any cell visible in this
# notebook -- presumably it is read by code from the helper notebook; confirm.
small_val = 0.01

Global Initializations

In [38]:
# Distribution names handed to mle() when fitting and to pdmf() when scoring.
DISTRIBUTIONS = ['zin_norm', 'zin_lognorm', 'zin_powerlaw']

# Number of cross-validation folds and number of distinct class labels
# (labels are addressed as 0 .. labels-1 by generate_kfolds).
kfold, labels = 5, 2

# Generate k random folds of train and test

In [39]:
def generate_kfolds(label_table_name, random_state=None):
    """Shuffle the labelled ids and split them into k train/test folds.

    The (group_id, label) pairs are read from ``label_table_name`` through the
    module-level ``engine``. The number of folds and the number of labels are
    taken from the module-level ``kfold`` and ``labels``; rows are matched to
    a label with ``label == l`` for l in 0 .. labels-1.

    Arguments
      label_table_name: name of the table with the group_id and label columns.
      random_state: optional seed forwarded to ``DataFrame.sample`` so the
        folds can be reproduced. The default (None) keeps the unseeded
        shuffle.

    Returns
      (label_prior_prob, train_fold, test_fold), each a k by l nested list:
        label_prior_prob[k][l]: fraction of fold k's training rows that carry
          label l.
        train_fold[k][l] / test_fold[k][l]: lists of the group_ids with label
          l in the train / test part of fold k.
    """
    # Load DATA from DB
    id_labels = pd.read_sql('select group_id, label from ' + label_table_name, con=engine)
    group_id_cnt = len(id_labels)
    print ("Total number of individuals: " + str(group_id_cnt))

    # generating k-folds of ids
    # random.shuffle() only works for lists; sample(frac=1) returns all rows
    # of the frame in random order.
    id_labels = id_labels.sample(frac=1, random_state=random_state)
    train_fold = [[0 for x in range(labels)] for y in range(kfold)]
    test_fold = [[0 for x in range(labels)] for y in range(kfold)]
    label_prior_prob = [[0 for x in range(labels)] for y in range(kfold)]

    for k in range(kfold):
        test_start = int(group_id_cnt*k/kfold)
        test_end = int(group_id_cnt*(k+1)/kfold)
        # Separate train and test. The frame's index is shuffled, so both
        # parts are cut positionally with .iloc.
        test_kth_fold = id_labels.iloc[test_start:test_end]
        train_kth_fold = pd.concat([id_labels.iloc[:test_start], id_labels.iloc[test_end:]])
        total = len(train_kth_fold)
        for l in range(labels): # For each fold, separate data with different labels
            train_fold[k][l] = (train_kth_fold[train_kth_fold.label == l].group_id).tolist()
            # float() forces true division; with two ints Python 2 would
            # truncate every prior to 0.
            label_prior_prob[k][l] = float(len(train_fold[k][l])) / total
            test_fold[k][l] = (test_kth_fold[test_kth_fold.label == l].group_id).tolist()
    return (label_prior_prob, train_fold, test_fold)

 # Build Naive Bayes classifier for each fold

 table_name: Name of the feature table
 
 ### Arguments
 * kfold: number of folds (not passed in; derived from train_fold)
 * labels: number of labels (not passed in; derived from train_fold)
 * train_fold: a k by l array. The k folds of the train set, each divided based on their labels
 
### Returns
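params: a k by l array of dictionaries, keyed by feature and then by distribution, holding the parameters of each distribution for each train set in each category of labels. A k by l array `defaults` is returned alongside it.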

In [96]:
def buildNaiveBayes(table_name, train_fold):
    kfold = len(train_fold)
    labels = len(train_fold[0])
    params = [[{} for l in range(labels)] for k in range(kfold)]
    defaults = [[init_default() for l in range(labels)] for k in range(kfold)]
    cntr = 1
    query_time = 0
    fit_time = 0
        
    stime = time.time()
    # Load list of distinct features
    features = pd.read_sql('select feat, count(*) cnt from ' + table_name + ' group by feat', con=engine)
    feat_cnt = len(features)
    print(str(feat_cnt) + ' unique features' )
    
    # Load data, feature by feature, and set distribution parameters for each quadruple
    # (fold, gender, feature, dist)
    for index, eachfeat in features.iterrows():
        feat = eachfeat.feat
        qs = time.time()
        feat2search = feat.replace("'", "''").replace("%", "%%").replace("\\", "\\\\")
        fetch_data_query = "select * from " + table_name + " where feat = '" + feat2search + "'"
        raw_df = pd.read_sql(fetch_data_query, con=engine)
        qe = time.time()

        for k in range(kfold):
            for l in range(labels):
                params[k][l][feat] = {}
                train = raw_df[raw_df.group_id.isin(train_fold[k][l])].group_norm.tolist()
                train += [0] * (len(train_fold[k][l]) - len(train))
                train = np.asarray(train) # input of functions "update_default" and "mle" should be nparray
                defaults[k][l] = update_default(train, defaults[k][l])
                for dist in DISTRIBUTIONS:
                    params[k][l][feat][dist] = mle(train, dist) # calc MLE
                     
        beste = time.time()
        query_time += (qe-qs)
        fit_time += (beste-qe)
        print '\r', str(cntr) + ' out of ' + str(feat_cnt) + ', ' + feat , 
        cntr += 1
    etime = time.time()
    print
    print('overall time: ' + str(round(etime-stime)) + 's -> I/O time: ' + str(round(query_time)) + \
          's, run time: ' + str(round(fit_time)) + 's') 

    return params, defaults

# Test Naive Bayes for all Distributions

In [133]:
def test_naiveBayes(table_name, test_fold, label_prior_prob, params, defaults):
    kfold = len(test_fold)
    labels = len(test_fold[0])
    tp = {}
    total = 0
    label_tp = [0] * labels
    label_total = [0] * labels
    # as we should have a distribution for all features in all kfolds and labels, it doesn't matter which k or l
    # we choose to get list of all features from 
    all_feats = set(params[0][0].keys())
    for l in range(labels):
        label_tp[l] = {}
        for dist in DISTRIBUTIONS:
            tp[dist] = 0
            label_tp[l][dist] = 0
    group_id_cnt = 0
    for k in range(kfold):
        for l in range(labels):
            group_id_cnt += len(test_fold[k][l])
    print ("Total number of test cases: " + str(group_id_cnt))
    cntr = 1
    query_total_time = 0
    run_time = 0
    st = time.time()
    for k in range(kfold):
        for test_l in range(labels): 
            qs = time.time()
            fetch_data_query = "select * from " + table_name + \
                " where group_id in ("+str(test_fold[k][test_l])[1:-1]+")"
            test_df = pd.read_sql(fetch_data_query, con=engine)
            qe = time.time()
            query_total_time += (qe - qs)
            for id, grp in test_df.groupby("group_id"):
                s1 = time.time()
                zero_feats = all_feats - set(grp.feat) # select features that have'nt been used by this id
                t3 = time.time()
                for dist in DISTRIBUTIONS:
                    max_prob = float("-inf")
                    best_label = -1
                    for l in range(labels): # go through all classes to find best class as the match label
                        label_prob = np.log(label_prior_prob[k][l])                                            
                        t4 = time.time()
                        for idx, row in grp.iterrows(): # go over all available features of an individual
                            data = [row.group_norm]
                            label_prob += np.log(pdmf(np.asarray(data) , dist, params[k][l][row.feat][dist],\
                                                     defaults[k][l]))
                            
                        t5 = time.time()
                        for feat in zero_feats: # go over all non-available features of an individual
                            label_prob += params[k][l][feat][dist]['logp0']
                        t6 = time.time()
                        if label_prob > max_prob: # choose best label with maximum probability
                            best_label = l
                            max_prob = label_prob
                    # Track correct classifications as tp:True Positive
                    if best_label == test_l:
                        tp[dist] += 1
                        label_tp[test_l][dist] += 1
                e1 = time.time()
                print '\r' + str(cntr) + ' out of ' + str(group_id_cnt), str(round(e1-s1, 2)), 
                cntr += 1
                # Track total number of test cases
                total += 1
                label_total[test_l] += 1
            run_time += (time.time() - qe)
    et = time.time()
    total_time = et-st
    print 
    print ("overall time: " + str(round(total_time)) + "s -> I/O time: " + str(round(query_total_time)) + \
           "s, run time: " + str(round(run_time)) + "s")
    return (total, label_total, tp, label_tp)

# Table Names

It's important that table_name and label_table_name be synchronized together. 
That's why I put the definitions of these two tables together in one chunk.

Table_name is the name of the feature table we want to analyze, which can be at the county (cnty), user, or message level. Depending on the level of the feature table, label_table_name should be the name of the table containing the list of group_ids and labels at the same level.

In [42]:
# Each level keeps its label table next to its three feature tables
# (LIWC, topics, 1grams) so the pairing cannot drift apart.

# County level
(c_label_table_name,
 cliwc_table_name,
 ctopic_table_name,
 c1gram_table_name) = (
    "msgs_cntiesw10u10m_cnties_gender",
    "feat$cat_LIWC2007$msgs_cntiesw10u10m$cnty$16to16",
    "feat$cat_met_a30_2000_cp_w$msgs_cntiesw10u10m$cnty$16to16",
    "feat$1gram$msgs_cntiesw10u10m$cnty$16to16$0_05",
)

# User level
(u_label_table_name,
 uliwc_table_name,
 utopic_table_name,
 u1gram_table_name) = (
    "msgs_cntiesw10u10m_usrs_gender",
    "feat$cat_LIWC2007$msgs_cntiesw10u10m$user_id$16to16",
    "feat$cat_met_a30_2000_cp_w$msgs_cntiesw10u10m$user_id$16to16",
    "feat$1gram$msgs_cntiesw10u10m$user_id$16to16$0_01",
)

# Message level
(m_label_table_name,
 mliwc_table_name,
 mtopic_table_name,
 m1gram_table_name) = (
    "msgs_w10u10m_mr20k_msgs_gender",
    "feat$cat_LIWC2007$msgs_w10u10m_mr20k$message_id$16to16",
    "feat$cat_met_a30_2000_cp_w$msgs_w10u10m_mr20k$message_id$16to16",
    "feat$1gram$msgs_w10u10m_mr20k$message_id$16to16$0_0005",
)

# Define all kfolds train and test in each level

In [43]:
# Build the folds once per level; every feature table of a level is trained
# and tested on the same folds. generate_kfolds shuffles without a seed, so
# re-running this cell produces different folds.
# County level
c_label_prior_prob, c_train_fold, c_test_fold = generate_kfolds(c_label_table_name)

# User level
u_label_prior_prob, u_train_fold, u_test_fold = generate_kfolds(u_label_table_name)

# Message level
m_label_prior_prob, m_train_fold, m_test_fold = generate_kfolds(m_label_table_name)

Total number of individuals: 872
Total number of individuals: 9234
Total number of individuals: 20000


# Train

In [116]:
# LIWC
print '\nLIWC, COUNTY'; cliwc_params, cliwc_defaults = buildNaiveBayes(cliwc_table_name, c_train_fold)
print '\nLIWC, USER'; uliwc_params, uliwc_defaults = buildNaiveBayes(uliwc_table_name, u_train_fold)
print '\nLIWC, MESSAGE'; mliwc_params, mliwc_defaults = buildNaiveBayes(mliwc_table_name, m_train_fold)

# TOPICS
print '\nTOPICS, COUNTY'; ctopic_params, ctopic_defaults = buildNaiveBayes(ctopic_table_name, c_train_fold)
print '\nTOPICS, USER'; utopic_params, utopic_defaults = buildNaiveBayes(utopic_table_name, u_train_fold)
print '\nTOPICS, MESSAGE'; mtopic_params, mtopic_defaults = buildNaiveBayes(mtopic_table_name, m_train_fold)

# 1GRAMS
print '\n1GRAM, COUNTY'; c1gram_params, c1gram_defaults = buildNaiveBayes(c1gram_table_name, c_train_fold)
print '\n1GRAM, USER'; u1gram_params, u1gram_defaults = buildNaiveBayes(u1gram_table_name, u_train_fold)
print '\n1GRAM, MESSAGE'; m1gram_params, m1gram_defaults = buildNaiveBayes(m1gram_table_name, m_train_fold)


 
LIWC, COUNTY
64 unique features
64 out of 64, YOU
overall time: 2.0s -> I/O time: 1.0s, run time: 1.0s

LIWC, USER
64 unique features
64 out of 64, YOU
overall time: 10.0s -> I/O time: 6.0s, run time: 3.0s

LIWC, MESSAGE
64 unique features
64 out of 64, YOU
overall time: 7.0s -> I/O time: 3.0s, run time: 4.0s

TOPICS, COUNTY
2000 unique features
2000 out of 2000, 999
overall time: 68.0s -> I/O time: 32.0s, run time: 33.0s

TOPICS, USER
2000 unique features
2000 out of 2000, 999
overall time: 388.0s -> I/O time: 231.0s, run time: 107.0s

TOPICS, MESSAGE
2000 unique features
2000 out of 2000, 999
overall time: 182.0s -> I/O time: 59.0s, run time: 117.0s

1GRAM, COUNTY
45387 unique features
45387 out of 45387, ￣
overall time: 783.0s -> I/O time: 169.0s, run time: 593.0s

1GRAM, USER
13879 unique features
13879 out of 13879, ️
overall time: 513.0s -> I/O time: 98.0s, run time: 403.0s

1GRAM, MESSAGE
2322 unique features
2322 out of 2322, ️
overall time: 124.0s -> I/O time: 9.0s, run time

# LIWC , COUNTY

In [104]:
# Evaluate the county-level LIWC models on the county test folds.
cliwc_tot, cliwc_ltot, cliwc_tp, cliwc_ltp = test_naiveBayes(
    cliwc_table_name, c_test_fold, c_label_prior_prob, cliwc_params, cliwc_defaults
)

Total number of test cases: 872
872 out of 872 0.0
overall time: 77.0s -> I/O time: 0.0s, run time: 76.0s


# LIWC, USER

In [109]:
# Evaluate the user-level LIWC models on the user test folds.
uliwc_tot, uliwc_ltot, uliwc_tp, uliwc_ltp = test_naiveBayes(
    uliwc_table_name, u_test_fold, u_label_prior_prob, uliwc_params, uliwc_defaults
)

Total number of test cases: 9234
9233 out of 9234 0.05
overall time: 760.0s -> I/O time: 6.0s, run time: 753.0s


# LIWC, MESSAGE

In [111]:
# Evaluate the message-level LIWC models on the message test folds.
mliwc_tot, mliwc_ltot, mliwc_tp, mliwc_ltp = test_naiveBayes(
    mliwc_table_name, m_test_fold, m_label_prior_prob, mliwc_params, mliwc_defaults
)

Total number of test cases: 20000
19207 out of 20000 0.02
overall time: 440.0s -> I/O time: 5.0s, run time: 435.0s


# LIWC, Compare Results

In [155]:
# Correct classifications per distribution out of all classified
# individuals, at county / user / message level.
for _hits, _cases in ((cliwc_tp, cliwc_tot), (uliwc_tp, uliwc_tot), (mliwc_tp, mliwc_tot)):
    print("%s out of %s" % (_hits, _cases))
print('\n\n')
# The same counts broken down by true label.
for _hits, _cases in ((cliwc_ltp, cliwc_ltot), (uliwc_ltp, uliwc_ltot), (mliwc_ltp, mliwc_ltot)):
    print("%s out of %s" % (_hits, _cases))

{'zin_lognorm': 470, 'zin_norm': 443, 'zin_powerlaw': 434} out of 872
{'zin_lognorm': 5159, 'zin_norm': 5726, 'zin_powerlaw': 4323} out of 9233
{'zin_lognorm': 10377, 'zin_norm': 10234, 'zin_powerlaw': 10344} out of 19207



[{'zin_lognorm': 291, 'zin_norm': 260, 'zin_powerlaw': 81}, {'zin_lognorm': 179, 'zin_norm': 183, 'zin_powerlaw': 353}] out of [436, 436]
[{'zin_lognorm': 2071, 'zin_norm': 3139, 'zin_powerlaw': 144}, {'zin_lognorm': 3088, 'zin_norm': 2587, 'zin_powerlaw': 4179}] out of [4943, 4290]
[{'zin_lognorm': 7066, 'zin_norm': 6956, 'zin_powerlaw': 8891}, {'zin_lognorm': 3311, 'zin_norm': 3278, 'zin_powerlaw': 1453}] out of [10589, 8618]


# TOPIC, COUNTY

In [148]:
# Evaluate the county-level topic models on the county test folds.
ctopic_tot, ctopic_ltot, ctopic_tp, ctopic_ltp = test_naiveBayes(
    ctopic_table_name, c_test_fold, c_label_prior_prob, ctopic_params, ctopic_defaults
)

 Total number of test cases: 872
872 out of 872 2.57
overall time: 2344.0s -> I/O time: 21.0s, run time: 2323.0s


# TOPIC, USER

In [156]:
# Evaluate the user-level topic models on the user test folds.
utopic_tot, utopic_ltot, utopic_tp, utopic_ltp = test_naiveBayes(
    utopic_table_name, u_test_fold, u_label_prior_prob, utopic_params, utopic_defaults
)

Total number of test cases: 9234
9233 out of 9234 1.67
overall time: 22706.0s -> I/O time: 181.0s, run time: 22525.0s


# TOPIC, MESSAGE

In [154]:
# Evaluate the message-level topic models on the message test folds.
mtopic_tot, mtopic_ltot, mtopic_tp, mtopic_ltp = test_naiveBayes(
    mtopic_table_name, m_test_fold, m_label_prior_prob, mtopic_params, mtopic_defaults
)

Total number of test cases: 20000
19025 out of 20000 0.26
overall time: 6701.0s -> I/O time: 50.0s, run time: 6651.0s


# TOPIC, Compare Results

In [157]:
# Correct classifications per distribution out of all classified
# individuals, at county / user / message level.
for _hits, _cases in ((ctopic_tp, ctopic_tot), (utopic_tp, utopic_tot), (mtopic_tp, mtopic_tot)):
    print("%s out of %s" % (_hits, _cases))
print('\n\n')
# The same counts broken down by true label.
for _hits, _cases in ((ctopic_ltp, ctopic_ltot), (utopic_ltp, utopic_ltot), (mtopic_ltp, mtopic_ltot)):
    print("%s out of %s" % (_hits, _cases))

{'zin_lognorm': 473, 'zin_norm': 433, 'zin_powerlaw': 446} out of 872
{'zin_lognorm': 5334, 'zin_norm': 5972, 'zin_powerlaw': 4394} out of 9233
{'zin_lognorm': 10337, 'zin_norm': 10163, 'zin_powerlaw': 10295} out of 19025



[{'zin_lognorm': 306, 'zin_norm': 277, 'zin_powerlaw': 391}, {'zin_lognorm': 167, 'zin_norm': 156, 'zin_powerlaw': 55}] out of [436, 436]
[{'zin_lognorm': 2393, 'zin_norm': 3442, 'zin_powerlaw': 282}, {'zin_lognorm': 2941, 'zin_norm': 2530, 'zin_powerlaw': 4112}] out of [4943, 4290]
[{'zin_lognorm': 5813, 'zin_norm': 5573, 'zin_powerlaw': 5941}, {'zin_lognorm': 4524, 'zin_norm': 4590, 'zin_powerlaw': 4354}] out of [10486, 8539]


# 1GRAM, COUNTY

In [None]:
# Evaluate the county-level 1gram models on the county test folds.
c1gram_tot, c1gram_ltot, c1gram_tp, c1gram_ltp = test_naiveBayes(
    c1gram_table_name, c_test_fold, c_label_prior_prob, c1gram_params, c1gram_defaults
)

# 1GRAM, USER

In [None]:
# Evaluate the user-level 1gram models on the user test folds.
u1gram_tot, u1gram_ltot, u1gram_tp, u1gram_ltp = test_naiveBayes(
    u1gram_table_name, u_test_fold, u_label_prior_prob, u1gram_params, u1gram_defaults
)

# 1GRAM, MESSAGE

In [None]:
# Evaluate the message-level 1gram models on the message test folds.
m1gram_tot, m1gram_ltot, m1gram_tp, m1gram_ltp = test_naiveBayes(
    m1gram_table_name, m_test_fold, m_label_prior_prob, m1gram_params, m1gram_defaults
)

# 1GRAM, Compare Results

In [None]:
# Correct classifications per distribution out of all classified
# individuals, at county / user / message level. Every line now carries the
# "out of" separator, matching the LIWC and TOPIC result cells (previously
# only the first line had it).
for _hits, _cases in ((c1gram_tp, c1gram_tot), (u1gram_tp, u1gram_tot), (m1gram_tp, m1gram_tot)):
    print("%s out of %s" % (_hits, _cases))
print('\n\n')
# The same counts broken down by true label.
for _hits, _cases in ((c1gram_ltp, c1gram_ltot), (u1gram_ltp, u1gram_ltot), (m1gram_ltp, m1gram_ltot)):
    print("%s out of %s" % (_hits, _cases))