In [6]:
import pandas as pd

users = [
    {'id':0, 'name':'Hero'},
    {'id':1, 'name':'Dunn'},
    {'id':2, 'name':'Sue'},
    {'id':3, 'name':'Chi'},
    {'id':4, 'name':'Thor'},
    {'id':5, 'name':'Clive'},
    {'id':6, 'name':'Hicks'},
    {'id':7, 'name':'Devin'},
    {'id':8, 'name':'Kate'},
    {'id':9, 'name':'Klein'},
]

friendships = [(0,1),(0,2),(1,2),(1,3),(2,3),(3,4),
              (4,5),(5,6),(5,7),(6,8),(7,8),(8,9),]

In [7]:
for user in users:
    user['friends'] = []

for i, j in friendships:
    # user [i] is the user whose id is i (i,j) in friendships list tuple
    users[i]['friends'].append(users[j]) # add i as a friend of j
    users[j]['friends'].append(users[i]) # add j as a friend of i

#### What's the average number of connections?

In [8]:
#length of friends_ids list

def number_of_friends(user):
    #how many friends does the user have?
    return len(user['friends'])

total_connections = sum(number_of_friends(user) for user in users)

In [19]:
from __future__ import division

num_users = len(users)
avg_connections = total_connections / num_users

24


In [20]:
#sort the users from 'most friends' to 'least friends'

# create a list (user_id, number_of_friends)
num_friends_by_id = [(user['id'], number_of_friends(user)) for user in users]
#network metric degree centrality
print(num_friends_by_id)

[(0, 2), (1, 3), (2, 3), (3, 3), (4, 2), (5, 3), (6, 2), (7, 2), (8, 3), (9, 1)]


#### Data Scientists You May Know

In [24]:
#foaf is short for 'friend of a friend'
def friends_of_friend(user):
    return [foaf['id']
            for friend in user['friends'] # for each of user's friends
            for foaf in friend['friends']] # for each of their friends

#user Hero friends 
print(friends_of_friend(users[0]))
# dá uma lista do id os amigos dos seus amigos, neste caso o user 0 tem como amigo o user 1 e o user 2
# sendo assim, obtemos uma lista com os amigos desses, incluindo ele proprio
# user 0 tem user 1 e user 2 amigos
# user 1 tem 0, 2, 3
# user 2 tem 0, 1, 3
print ([friend['id'] for friend in users[0]['friends']])
print ([friend['id'] for friend in users[1]['friends']])
print ([friend['id'] for friend in users[2]['friends']])


[0, 2, 3, 0, 1, 3]
[1, 2]
[0, 2, 3]
[0, 1, 3]


In [28]:
from collections import Counter
# Sabendo que as pessoas sao amigos dos amigos em várias formas podemos produzir um contador de amigos mutuos
# E devemos usar uma função de ajuda para excluir os users que já conhece

def not_the_same(user, other_user):
    # 2 users não são iguais se tiverem diferentes id's
    return user['id'] != other_user['id']

def not_friends(user, other_user):
    # o other_user não é amigo se não estiver na user['friends']
    # ou seja, se not_the_same em todas as pessoas em user['friends']
    return all(not_the_same(friend,other_user)
               for friend in user['friends'])

def friends_of_friends_ids(user):
    return Counter(foaf['id']
                for friend in user['friends'] # para cada um dos meus amigos
                for foaf in friend['friends'] # contar os amigos deles
                if not_the_same(user, foaf) # que nao sao eu
                and not_friends(user, foaf)) # nem meus amigos

print(friends_of_friends_ids(users[0]))
# Counter({3: 2}) , o user 0 , não é amigo do user 3, porém o user 3 tem 2 amigos que o user 0 tem.

Counter({3: 2})


#### Substantive expertise

In [33]:
interests = [
    (0, 'Haddop'),(0, 'Big Data'),(0, 'HBASE'),(0, 'Java'),
    (0, 'Spark'),(0, 'Storm'),(0, 'Cassandra'),
    (1, 'NoSQL'),(1, 'MongoDB'),(1, 'Cassandra'),(1, 'HBase'),
    (1, 'Postgres'),(2, 'Python'),(2, 'sikit-learn'),(2, 'scipy'),
    (2, 'numpy'),(2, 'statsmodels'),(2, 'pandas'),(3, 'R'),
    (3, 'Python'),(3, 'statistics'),(3, 'regression'),(3, 'probability'),
    (4, 'machine learning'),(4, 'regression'),(4, 'decision trees'),(4, 'libsvm'),
    (5, 'Python'),(5, 'R'),(5, 'Java'),(5, 'C++'),
    (4, 'Haskell'),(5, 'programming languages'),(6, 'statistics'),
    (6, 'probability'),(6, 'mathematics'),(6, 'theory'),
    (7, 'machin learning'),(7, 'sikit-learn'),(7, 'Mahout'),
    (7, 'neural networks'),(8, 'neural networks'),(8, 'deep learning'),
    (8, 'Big Data'),(8, 'artificial intelligence'),(9, 'Hadoop'),
    (9, 'Java'),(9, 'MapReduce'),(9, 'Big Data')    
]
# encontrar users com os meus interesses
def data_scientists_who_like(target_interest):
    return [user_id
            for user_id, user_interest in interests
            if user_interest == target_interest]

# criar um index dos interesses dos users
from collections import defaultdict
# as keys sao os interests e os values sao os user_ids com esses interests
user_ids_by_interest = defaultdict(list)
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

print(user_ids_by_interest)

print('************************')
# e outra com os user_ids e seus interesses
interests_by_user_id = defaultdict(list)
for user_id, interest in interests:
    interests_by_user_id[user_id].append(interest)

print(interests_by_user_id)

defaultdict(<class 'list'>, {'Haddop': [0], 'Big Data': [0, 8, 9], 'HBASE': [0], 'Java': [0, 5, 9], 'Spark': [0], 'Storm': [0], 'Cassandra': [0, 1], 'NoSQL': [1], 'MongoDB': [1], 'HBase': [1], 'Postgres': [1], 'Python': [2, 3, 5], 'sikit-learn': [2, 7], 'scipy': [2], 'numpy': [2], 'statsmodels': [2], 'pandas': [2], 'R': [3, 5], 'statistics': [3, 6], 'regression': [3, 4], 'probability': [3, 6], 'machine learning': [4], 'decision trees': [4], 'libsvm': [4], 'C++': [5], 'Haskell': [4], 'programming languages': [5], 'mathematics': [6], 'theory': [6], 'machin learning': [7], 'Mahout': [7], 'neural networks': [7, 8], 'deep learning': [8], 'artificial intelligence': [8], 'Hadoop': [9], 'MapReduce': [9]})
************************
defaultdict(<class 'list'>, {0: ['Haddop', 'Big Data', 'HBASE', 'Java', 'Spark', 'Storm', 'Cassandra'], 1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'], 2: ['Python', 'sikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'], 3: ['R', 'Python', 'statistics',

#### Encontrar quem tem os mesmos interesses que um dado user:

In [39]:
# Iterar os interesses do user
# Para cada interesse, iterar sobre o outro user interesse
# Contar quantas vezes nos vemos um user

def most_common_interests_with(user):
    return Counter(interested_user_id
                   for interest in interests_by_user_id[user['id']]
                   for interested_user_id in user_ids_by_interest[interest]
                   if interested_user_id != user['id'])

most_common_interests_with(user)

Counter({0: 2, 5: 1, 8: 1})

#### Salaries and Experience

In [54]:
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2)]

# keys sao anos, values sao uma lista de salarios 
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)
# em função dos anos de serviço
print(salary_by_tenure)

# keys são os anos, e os valores são a média do salario por anos de serviço
average_salary_by_tenure = {
    tenure : sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}
print('***************************')
print(average_salary_by_tenure)

defaultdict(<class 'list'>, {8.7: [83000], 8.1: [88000], 0.7: [48000], 6: [76000], 6.5: [69000], 7.5: [76000], 2.5: [60000], 10: [83000], 1.9: [48000], 4.2: [63000]})
***************************
{8.7: 83000.0, 8.1: 88000.0, 0.7: 48000.0, 6: 76000.0, 6.5: 69000.0, 7.5: 76000.0, 2.5: 60000.0, 10: 83000.0, 1.9: 48000.0, 4.2: 63000.0}


In [57]:
def tenure_bucket(tenure):
    if tenure <2:
        return 'less than two'
    elif tenure <5:
        return 'between two and five'
    else: 
        return 'more than five'

# Agrupar salário no corespondente bucket:
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Compute o salario medio para cada grupo
avarage_salary_by_bucket = {
    tenure_bucket : sum(salaries) / len(salaries)
    for tenure_bucket, salaries in salary_by_tenure_bucket.items()
}
print(avarage_salary_by_bucket)

# 'Data scientists with more than five years experience earn 65% more than scientists with little or no experience!'


{'more than five': 79166.66666666667, 'less than two': 48000.0, 'between two and five': 61500.0}


#### Interesses mais populares

In [58]:
print(interests)

[(0, 'Haddop'), (0, 'Big Data'), (0, 'HBASE'), (0, 'Java'), (0, 'Spark'), (0, 'Storm'), (0, 'Cassandra'), (1, 'NoSQL'), (1, 'MongoDB'), (1, 'Cassandra'), (1, 'HBase'), (1, 'Postgres'), (2, 'Python'), (2, 'sikit-learn'), (2, 'scipy'), (2, 'numpy'), (2, 'statsmodels'), (2, 'pandas'), (3, 'R'), (3, 'Python'), (3, 'statistics'), (3, 'regression'), (3, 'probability'), (4, 'machine learning'), (4, 'regression'), (4, 'decision trees'), (4, 'libsvm'), (5, 'Python'), (5, 'R'), (5, 'Java'), (5, 'C++'), (4, 'Haskell'), (5, 'programming languages'), (6, 'statistics'), (6, 'probability'), (6, 'mathematics'), (6, 'theory'), (7, 'machin learning'), (7, 'sikit-learn'), (7, 'Mahout'), (7, 'neural networks'), (8, 'neural networks'), (8, 'deep learning'), (8, 'Big Data'), (8, 'artificial intelligence'), (9, 'Hadoop'), (9, 'Java'), (9, 'MapReduce'), (9, 'Big Data')]


In [60]:
# colocar todos os interesses em letra miniuscula para os comparar
# Dividir em palavras
# contar o resultado

words_and_counts = Counter(word
                           for user, interest in interests
                           for word in interest.lower().split())
# criar uma lista com as palavras mais comuns
for word, count in words_and_counts.most_common():
    if count > 1:
        print(word, count)

big 3
data 3
java 3
python 3
learning 3
hbase 2
cassandra 2
sikit-learn 2
r 2
statistics 2
regression 2
probability 2
neural 2
networks 2
