In [2]:
import unicodecsv 

def read_csv(filename):
    """Read a CSV file and return its rows as a list.

    The file is opened in binary mode and handed to
    ``unicodecsv.DictReader``; every row the reader yields is collected
    before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        return [row for row in unicodecsv.DictReader(csv_file)]

In [3]:
# Load the three source tables, each as a list of row dicts.
enrollments, daily_engagement, project_submissions = map(
    read_csv,
    ('enrollments.csv', 'daily_engagement.csv', 'project_submissions.csv'),
)

In [4]:
# Display the first enrollment row to inspect the values as loaded from the CSV.
enrollments[0]

{u'account_key': u'448',
 u'cancel_date': u'2015-01-14',
 u'days_to_cancel': u'65',
 u'is_canceled': u'True',
 u'is_udacity': u'True',
 u'join_date': u'2014-11-10',
 u'status': u'canceled'}

## Arrumando os tipos de dados

In [5]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime.

    Returns None when ``date`` is the empty string; any other value is
    parsed with the '%Y-%m-%d' format.
    """
    if date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d')
    
def parse_maybe_int(i):
    """Convert a string that may hold an integer.

    Returns None when ``i`` is the empty string, otherwise ``int(i)``.
    """
    return None if i == '' else int(i)

# Convert the raw CSV strings of the enrollment table into proper Python types.
for enrollment in enrollments:
    # 'is_canceled' only becomes a bool once this loop has processed the row,
    # so a bool here means the cell is being re-run. Skip the row: converting
    # it again would turn both flags into False (True == 'True' is False) and
    # make parse_date / parse_maybe_int fail on already-converted values.
    if isinstance(enrollment['is_canceled'], bool):
        continue
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])

# Display the first row to confirm the converted types.
enrollments[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [6]:
def unique_keys(table, key):
    """Return the set of distinct values stored under ``key`` in ``table``.

    ``table`` is an iterable of rows that support ``row[key]`` lookups.
    """
    return {row[key] for row in table}

# Total rows and number of distinct student keys for each table.
# The engagement table still uses the 'acct' column name at this point.
enrollment_num_rows = len(enrollments)
enrollment_num_unique_students = len(unique_keys(enrollments, 'account_key'))

daily_engagement_num_rows = len(daily_engagement)
daily_engagement_num_unique = len(unique_keys(daily_engagement, 'acct'))

project_submissions_num_rows = len(project_submissions)
unique_project_submitters = len(unique_keys(project_submissions, 'account_key'))

# Print in table order: row count first, then distinct students.
print(enrollment_num_rows)
print(enrollment_num_unique_students)
print(daily_engagement_num_rows)
print(daily_engagement_num_unique)
print(project_submissions_num_rows)
print(unique_project_submitters)

1640
1302
136240
1237
3642
743


In [7]:
# Standardize the student key name: the engagement table calls it 'acct',
# while the other tables call it 'account_key'.
for engagement in daily_engagement:
    # Rows renamed by an earlier run of this cell no longer have 'acct';
    # skipping them lets the cell be re-run without raising KeyError.
    if 'acct' in engagement:
        engagement['account_key'] = engagement.pop('acct')

In [9]:
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in ``data``.

    ``data`` is an iterable of rows that each provide an 'account_key' entry.
    """
    return {row['account_key'] for row in data}

In [11]:
# Collect the distinct students of each table now that all three share
# the 'account_key' column name.
unique_enrolled_students = get_unique_students(enrollments)
unique_engagement_students = get_unique_students(daily_engagement)
unique_project_submitters = get_unique_students(project_submissions)

# For each table: total rows, then distinct students.
print(len(enrollments))
print(len(unique_enrolled_students))
print(len(daily_engagement))
print(len(unique_engagement_students))
print(len(project_submissions))
print(len(unique_project_submitters))

1640
1302
136240
1237
3642
743


In [12]:
# Investigate why some enrolled students never appear in the engagement
# table: show the first enrollment whose student has no engagement rows.
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        # Function-call form, consistent with the other cells; with a single
        # argument it prints the same text on Python 2 and also parses on
        # Python 3.
        print(enrollment)
        break

{u'status': u'canceled', u'is_udacity': False, u'is_canceled': True, u'join_date': datetime.datetime(2014, 11, 12, 0, 0), u'account_key': u'1219', u'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), u'days_to_cancel': 0}


In [13]:
# Count (and print) the enrollments that remain unexplained: the student is
# missing from the engagement table and the enrollment was not cancelled on
# the same date it started.
num_problems_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if student in unique_engagement_students:
        continue
    if enrollment['cancel_date'] == enrollment['join_date']:
        continue
    num_problems_students += 1
    print(enrollment)
num_problems_students

{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 1, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), u'days_to_cancel': 59}
{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 3, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), u'days_to_cancel': 99}
{u'status': u'current', u'is_udacity': True, u'is_canceled': False, u'join_date': datetime.datetime(2015, 2, 25, 0, 0), u'account_key': u'1101', u'cancel_date': None, u'days_to_cancel': None}


3

In [14]:
# The remaining problem rows turned out to be test accounts. Collect the
# account keys of every enrollment flagged 'is_udacity' and count them.
udacity_test_accounts = {
    row['account_key'] for row in enrollments if row['is_udacity']
}
# Number of distinct test-account keys.
len(udacity_test_accounts)

6

In [15]:
def remove_udacity_accounts(data):
    """Return the rows of ``data`` that do not belong to a test account.

    A row is dropped when its 'account_key' is in the module-level
    ``udacity_test_accounts`` collection. The input is not modified; a new
    list is returned with the surviving rows in their original order.
    """
    return [
        row for row in data
        if row['account_key'] not in udacity_test_accounts
    ]

In [16]:
# Drop the test-account rows from each of the three tables.
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

# Report how many rows remain in each filtered table.
for filtered_table in (non_udacity_enrollments,
                       non_udacity_engagement,
                       non_udacity_submissions):
    print(len(filtered_table))

1622
135656
3634


In [18]:
# Map each student who did not cancel within 7 days to a join date.
paid_students = {}
for enrollment in non_udacity_enrollments:
    # Skip enrollments that were cancelled without lasting more than 7 days.
    if enrollment['is_canceled'] and not enrollment['days_to_cancel'] > 7:
        continue
    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']
    # One entry per student: a stored date is only replaced by a more
    # recent join date.
    if account_key in paid_students and not enrollment_date > paid_students[account_key]:
        continue
    paid_students[account_key] = enrollment_date
len(paid_students)

995