In [1]:
from piazza_api import Piazza
from functools import reduce
from os import environ
import json
import pandas as pd

# Piazza - API Crawling

In [5]:
def _extract_ids(course):
    courses = []
    
    keys= [("cn", "course_cmu_id"),
    ("n", "name")]
    
    new_course = {}
    terms = course.get('terms', {})
    for key, new_key in keys:
        new_course[new_key] = course.get(key)
    
    keys= [("cnt", "cnt"),
    ("id","course_piazza_id"),
    ("prof", "prof")]
    
    terms = course.get('terms', {})
    for term in terms:
        features = new_course.copy()
        tmp = terms[term]
        
        features['term'] = term
        
        if 'id' not in tmp:
            print('Skipped', features)
            continue
    
        for key, new_key in keys:
            features[new_key] = tmp.get(key)
        
        courses.append(features)

    return courses

def get_piazza_ids(filepath='./data/courses_ids.json'):
    """Load the course listing from JSON and flatten it to per-term rows.

    The file is expected to contain a JSON list of course records; each
    record is expanded by ``_extract_ids`` into zero or more rows and the
    results are concatenated in file order.

    Args:
        filepath: path of the UTF-8 encoded JSON file to read.

    Returns:
        A flat list of row dicts; an empty list when the file lists no
        courses.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        courses_ids = json.load(f)

    # A flattening comprehension instead of reduce(a + b): reduce without an
    # initial value raises TypeError on an empty listing, and repeated list
    # concatenation copies the accumulated rows on every step.
    return [row for course in courses_ids for row in _extract_ids(course)]

In [3]:
def _extract_stats(stats):
    features = {}

    # Total
    tmp = stats['total']
    
    keys= [("posts", "stats_posts"),
    ("questions", "stats_questions"),
    ("i_answers", "stats_i_answers"),
    ("s_answers", "stats_s_answers"),
    ("net_time", "stats_net_time"),
    ("response_time", "stats_response_time")]
    
    for key, new_key in keys:
        features[new_key] = tmp.get(key)
    
    
    keys = [("user_id", "top_user_%d_user_id"),
    ("days", "top_user_%d_days"),
    ("posts", "top_user_%d_posts"),
    ("asks", "top_user_%d_asks"),
    ("answers", "top_user_%d_answers"),
    ("views", "top_user_%d_views")]
    
    tmp = stats.get('top_users',[])
    for i in range(min(3, len(tmp))):
        top_user = tmp[i]
        for key, new_key in keys:
            features[new_key % i] = top_user.get(key)
    
    return features

def get_piazza_stats(courses,
                     out_filename=None,
                     user=None,
                     pwd=None):
    """Fetch statistics for each Piazza course id and return them as a frame.

    Logs in once, then for every id in ``courses`` requests the course
    statistics, flattens them with ``_extract_stats`` and tags the row with
    ``course_piazza_id``.

    Args:
        courses: sized iterable of Piazza course ids.
        out_filename: optional CSV path; when truthy the frame is also
            written there without the index.
        user: login name. When ``None`` the ``PIAZZA_USER`` environment
            variable is read at call time.
        pwd: password. When ``None`` the ``PIAZZA_PWD`` environment variable
            is read at call time.

    Returns:
        A ``pandas.DataFrame`` with one row per course id.
    """
    # Resolve credentials when the function is called, not when it is
    # defined: defaults written as environ.get(...) in the signature are
    # evaluated once at definition time, so variables set after the cell has
    # run would be silently ignored.
    # NOTE(review): if the variables are unset, None is still passed on to
    # user_login, as before - confirm how piazza_api handles that.
    if user is None:
        user = environ.get('PIAZZA_USER')
    if pwd is None:
        pwd = environ.get('PIAZZA_PWD')

    p = Piazza()
    p.user_login(user, pwd)

    courses_data = []
    total = len(courses)
    # Count from 1 so the progress line ends at total/total rather than
    # stopping one short.
    for i, c_id in enumerate(courses, start=1):
        print('%d/%d' % (i, total))
        course = p.network(c_id)
        stats = course.get_statistics()
        features = _extract_stats(stats)
        features['course_piazza_id'] = c_id
        courses_data.append(features)

    df = pd.DataFrame(courses_data)
    if out_filename:
        df.to_csv(out_filename, index=False)
    return df

In [12]:
def main(in_filename='./data/courses_ids.json', out_filename='./data/piazza.csv'):
    """Build the combined Piazza dataset and write it to CSV.

    Reads the course listing, fetches statistics for every
    ``course_piazza_id`` found in it, joins the two tables on that id and
    saves the result.

    Args:
        in_filename: path of the JSON course listing.
        out_filename: path of the CSV file to write (index omitted).

    Returns:
        The merged ``pandas.DataFrame``.
    """
    data = get_piazza_ids(filepath=in_filename)
    df_details = pd.DataFrame(data)
    # The ids come from df_details; the previous code read them from `df`,
    # which is only assigned on the next line and therefore raised
    # UnboundLocalError on every call.
    df_stats = get_piazza_stats(df_details['course_piazza_id'].values)
    df = df_details.merge(df_stats, on='course_piazza_id')
    df.to_csv(out_filename, index=False)
    return df

# FCE - Web Crawling

In [18]:
# PENDING #