In [1]:
import os
import sys

import MySQLdb as mdb
import pandas as pd


# Connection settings for the MySQL server. The functions in this notebook
# pass them positionally to mdb.connect() in the order
# (host, user name, password, database name), followed by charset options.
# The default host is a remote server, not localhost.
#
# Each value can be overridden through an environment variable so that
# credentials do not have to be stored in the notebook.
# SECURITY: the literal fallbacks are kept only so existing runs keep working;
# rotate this password and remove the fallbacks once the environment
# variables are set wherever the notebook is executed.
host = os.environ.get('CITATION_DB_HOST', '173.194.242.182')
username = os.environ.get('CITATION_DB_USER', 'programize')
password = os.environ.get('CITATION_DB_PASSWORD', 'scholar123!')
database = os.environ.get('CITATION_DB_NAME', 'citation_analysis_db')



In [2]:
def create_table():
    """Create the ``h_index`` table in the configured database if it is missing.

    The table has three integer columns (author_id, year, h_index) and uses
    (author_id, year) as its primary key. The statement is
    ``CREATE TABLE IF NOT EXISTS``, so calling this repeatedly is harmless.

    The cursor and the connection are closed even when the statement fails;
    any database error is propagated to the caller.
    """
    table_name = 'h_index'
    create_table_query = '''CREATE TABLE IF NOT EXISTS {0}.{1}
                                    (author_id int,
                                    year int,
                                    h_index int,
                                    PRIMARY KEY(author_id, year)
                                    )'''.format(database, table_name)

    con = mdb.connect(host, username, password, database,
                      charset='utf8', use_unicode=True)
    try:
        cursor = con.cursor()
        try:
            cursor.execute(create_table_query)
        finally:
            cursor.close()
    finally:
        # Always release the connection, including on SQL errors.
        con.close()

In [None]:
def fetch_author_data_from_db(authorid):
    """Fetch the yearly citation counts of every publication of one author.

    Parameters
    ----------
    authorid
        Value matched against ``author.id``. It is sent to the driver as a
        bound parameter, never interpolated into the SQL text.

    Returns
    -------
    The result of ``fetchall()`` on a ``DictCursor``: one dict per
    (publication, year) with the keys ``publication_id``, ``year`` and
    ``citations``, ordered by year. Empty when the author has no rows.

    The cursor and the connection are closed even when the query fails.
    """
    # The filter column is qualified as A.id (the original used a bare "id");
    # this is the same column the join and ORDER BY already refer to.
    sql_query = '''SELECT P.publication_id, Y.year, Y.citations
                   FROM author A INNER JOIN
                        author_publication P ON A.id=P.author_id INNER JOIN
                        publication_citations_per_year Y ON P.publication_id = Y.publication_id
                   WHERE A.id = %s
                   ORDER BY A.id, Y.year'''

    con = mdb.connect(host, username, password, database,
                      charset='utf8', use_unicode=True)
    try:
        cur = con.cursor(mdb.cursors.DictCursor)
        try:
            cur.execute(sql_query, (authorid,))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        con.close()

In [None]:
def get_author_data(authorid):
    """Build the cumulative-citation table of one author.

    Parameters
    ----------
    authorid
        Author id forwarded to ``fetch_author_data_from_db``.

    Returns
    -------
    ``None`` when the author has no rows. Otherwise a DataFrame with one row
    per ``publication_id`` and one column per ``year``, holding the running
    total of citations of that publication up to and including that year.
    Years before a publication's first recorded row stay NaN.
    """
    rows = fetch_author_data_from_db(authorid)
    if not rows:
        return None

    # One row per year, one column per publication, yearly citation counts.
    yearly = pd.DataFrame(list(rows)).pivot(
        index='year', columns='publication_id', values='citations')

    # cumsum() skips the gaps (years without a row for a publication) and
    # ffill() then carries the last running total across them.
    # .ffill() replaces the deprecated fillna(method='ffill').
    cumulative = yearly.cumsum().ffill()

    return cumulative.T

In [None]:
def get_h_index(df, year, min_h=0):
    """Return the h-index for the citation counts in column ``year`` of ``df``.

    The h-index is the largest ``h`` such that at least ``h`` entries of the
    column are greater than or equal to ``h``.

    Parameters
    ----------
    df
        DataFrame whose column ``year`` holds one citation count per row.
        NaN entries are ignored.
    year
        Column label to read.
    min_h
        Accepted for backward compatibility as a lower-bound hint. The result
        is computed from the data alone, so an over-estimated hint can no
        longer produce a wrong value (the previous implementation returned
        ``min_h - 1`` in that case).

    Returns
    -------
    int
        The h-index; 0 for an empty or all-NaN column.
    """
    # Rank the counts from highest to lowest; NaN is dropped first because it
    # cannot be ordered and never counted towards the h-index anyway.
    ranked = sorted(df[year].dropna(), reverse=True)

    h = 0
    for rank, count in enumerate(ranked, start=1):
        if count < rank:
            # The rank-th best entry is too small, and so are all later ones.
            break
        h = rank
    return h

In [None]:
def get_h_index_history(df):
    """Compute the h-index for every year column of ``df``, oldest first.

    The distinct column labels of ``df`` are visited in sorted order. For
    each one ``get_h_index`` is called with the previous year's result as its
    ``min_h`` argument (0 for the first year).

    Returns a list of dicts, one per year, with the keys ``'year'`` and
    ``'h-index'``.
    """
    history = []
    running_h = 0
    for yr in sorted(set(df.columns.values)):
        running_h = get_h_index(df, yr, running_h)
        history.append({'year': yr, 'h-index': running_h})
    return history

In [None]:
def insert_line_into_db(con, database, table_name, authorid, year, h_index):
    """Insert one (author_id, year, h_index) row using ``INSERT IGNORE``.

    Parameters
    ----------
    con
        Open database connection. This function does not commit or close it.
    database, table_name
        Identifiers placed into the statement text. SQL identifiers cannot be
        bound as parameters, so they must come from trusted configuration.
    authorid, year, h_index
        Row values. They are converted to ``int`` and sent as bound
        parameters instead of being formatted into the SQL text.

    Raises
    ------
    TypeError, ValueError
        When a row value cannot be converted to ``int``.
    """
    insert_query = (
        'INSERT IGNORE INTO {db}.{table}(author_id, year, h_index) '
        'VALUES (%s, %s, %s)'
    ).format(db=database, table=table_name)

    # Callers may pass numpy integers (e.g. DataFrame column labels); plain
    # ints make sure the driver binds them as numbers.
    values = (int(authorid), int(year), int(h_index))

    cursor = con.cursor()
    try:
        cursor.execute(insert_query, values)
    finally:
        cursor.close()


In [None]:
def insert_author_into_db(authorid):
    """Compute the h-index history of one author and store it in ``h_index``.

    Does nothing when ``get_author_data`` returns ``None`` for the author.
    Otherwise one row per year is written through ``insert_line_into_db`` and
    the rows are committed together.

    The connection is opened only once there is something to write, and it is
    closed even when an insert fails. Previously it was opened first and
    leaked on the early return. On failure nothing is committed and the error
    propagates.
    """
    table_name = 'h_index'

    df = get_author_data(authorid)
    if df is None:
        return
    results = get_h_index_history(df)

    con = mdb.connect(host, username, password, database,
                      charset='utf8', use_unicode=True)
    try:
        for row in results:
            insert_line_into_db(con, database, table_name, authorid,
                                row["year"], row["h-index"])
        con.commit()
    finally:
        con.close()

In [None]:
def get_author_ids():
    """Return the sorted ids of authors that still need an h-index history.

    Selects the authors whose ``scholar_id`` and ``organization_id`` are both
    non-NULL and whose id does not yet appear as ``author_id`` in the
    ``h_index`` table.

    The cursor and the connection are closed even when the query fails.
    """
    # Use the configured database name rather than a hard-coded one so this
    # query targets the same schema as the rest of the notebook.
    sql_query = (
        'SELECT id FROM {0}.author '
        'WHERE scholar_id IS NOT NULL AND organization_id IS NOT NULL '
        'AND id NOT IN (SELECT author_id FROM h_index)'
    ).format(database)

    con = mdb.connect(host, username, password, database,
                      charset='utf8', use_unicode=True)
    try:
        cur = con.cursor(mdb.cursors.DictCursor)
        try:
            cur.execute(sql_query)
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        con.close()

    return sorted(row["id"] for row in rows)

In [None]:
# Driver: compute and store the h-index history of every pending author.
# get_author_ids() only returns authors that have no row in h_index yet, so
# re-running this cell continues with the authors that are still missing.
author_ids = get_author_ids()
# Number of authors this run will process.
print(len(author_ids))
for authorid in author_ids:
    # Printed before the work starts, so the last id shown is the one that
    # was being processed if the run stops with an error.
    print(authorid)
    insert_author_into_db(authorid)
    