In [1]:
import networkx as nx
import numpy as np
import os
import codecs
import json
from collections import defaultdict as dd
from sklearn.externals import joblib

In [2]:
log_regression = joblib.load('data/log_regression/log_regression.pkl')

In [3]:
def get_user_network(path, uid):
    p = "%s/%d/user_network.txt"%(path, uid) # " path + str(uid) + '\user_network.txt'
    print p
    with codecs.open(p, 'r','utf-8') as inf:
        netw = json.load(inf)
    
    ids = netw.keys()
    for k, wrong_id in netw.items():
        wr_id = []
        for f in wrong_id:
            if str(f) not in ids:
                wr_id.append(f)
        netw[k] = filter(lambda x: x not in wr_id, netw[k])      
    return netw

In [4]:
#!ls data/1005299/
user_id = 1005299
list_of_attr = ['sex', 'city', 'country', 'graduation', 'university', 'school']

user_network = get_user_network("data", user_id)
friends = len(user_network.keys())


data/1005299/user_network.txt


In [5]:
def comparing_components(comp_1, comp_2):
    if comp_1 == None or comp_2 == None:
        return 0
    else:
        return int(comp_1 == comp_2)
    
def mutual_friends(m_fr):
    if friends <= 0:
        return 0
    else:
        return m_fr / float(friends)

def comparing(inf1, inf2, m_fr):
    res = []
    res.append(mutual_friends(m_fr))
    for i in range(1, len(list_of_attr) + 1):
        res.append(comparing_components(inf1[i], inf2[i]))
    res.append(res[5] and res[6])
    return res

def get_inf_from_file(path, uid, withbdate=False):
    p = "%s/%d/%s.txt"%(path, user_id, uid)
    #p = r'D:\University\Mishenin\ids\\' + str(user_id) + '\\' + str(id_us) + '.txt'
    with codecs.open(p, 'r','utf-8') as inf:
        inf_about_user = json.load(inf)
    inf_about_user = dd(lambda: None, inf_about_user)
    
    res_inf = []
    
    res_inf.append(inf_about_user['id'])
    
    if withbdate:
        res_inf.append(inf_about_user['bdate'])
    
    res_inf.append(inf_about_user['sex'])
    
    
    try:
        res_inf.append(inf_about_user['city']['id'])
    except:
        res_inf.append(None)
        
    try:
        res_inf.append(inf_about_user['country']['id'])
    except:
        res_inf.append(None)
        
    res_inf.append(inf_about_user['graduation'])
    res_inf.append(inf_about_user['university'])
    
    try:
        res_inf.append(int(inf_about_user['schools'][0]['id']))
    except:
        res_inf.append(None)
    
    return res_inf

def get_feature(id_1, id_2, m_fr):
    res_feat = []
    res_feat = comparing(get_inf_from_file("data", id_1), get_inf_from_file("data", id_2), m_fr)
    return res_feat

def find_w(id_1, id_2, m_fr):
    features = []
    features.append(get_feature(id_1, id_2, m_fr))
    return log_regression.predict_proba(features)[0][1]
   
def init_graph(u_network):
    graph = nx.Graph()
    all_friend = u_network.keys()
    for k, v in u_network.items():
        for t in v:
            e = 0
            for el in u_network[str(t)]:
                if el in v:
                    e += 1
            w = find_w(k, t, e)
            graph.add_edge(int(k),int(t), weight = w)
    return graph



In [6]:
s_graph = init_graph(user_network)

In [7]:
import numpy as np
import pandas as pd

import networkx as nx
import matplotlib.pyplot as plt

from sklearn.externals import joblib
from sklearn import linear_model, svm
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from scipy.special import expit

import datetime
import random

%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)

In [8]:
def calc_common_friends(G):
    common_friends = dict()
    for f, t in G.edges():
        if f < t:
            fr1 = set(G.neighbors(f))
            fr2 = set(G.neighbors(t))
            common_friends[(f, t)] = len(fr1 & fr2)
            
    return common_friends

In [9]:
G = s_graph
users = map(int, user_network.keys())
users = dict(zip(users, range(len(users))))

common_friends = calc_common_friends(G)  
G.adj

{197888: {971046: {'weight': 0.84649336322131941},
  1005299: {'weight': 0.83967570779626544}},
 227364: {1005299: {'weight': 0.83933698170521254}},
 299041: {1005299: {'weight': 0.84035142634566873},
  2607358: {'weight': 0.83933698170521254},
  4016004: {'weight': 0.83967570779626544},
  34075542: {'weight': 0.83967570779626544}},
 332243: {1005299: {'weight': 0.83967570779626544},
  4016004: {'weight': 0.83933698170521254}},
 476181: {925043: {'weight': 0.84681975504987272},
  1005299: {'weight': 0.84068841963106344},
  1125273: {'weight': 0.81081526023275452},
  2782004: {'weight': 0.83967570779626544},
  3800048: {'weight': 0.84681975504987272}},
 520950: {1005299: {'weight': 0.8416959405390102},
  2428140: {'weight': 0.84747083315103833},
  3403462: {'weight': 0.84136067630755462},
  4016004: {'weight': 0.84068841963106344},
  5036169: {'weight': 0.84714557820713032},
  9434575: {'weight': 0.84068841963106344},
  24308986: {'weight': 0.84811964024052355},
  213161330: {'weight': 

In [10]:
import datetime as dt
year = dt.datetime.now().year

def map_age(x):
    if x is not None:
        splited = x.split('.')
        if len(splited) == 3:
            return int(splited[2])
    return None

def read_demography(path, users):
   
    data = []
    for user in users:
        data.append(get_inf_from_file(path, int(user), True))
        
    
    df = pd.DataFrame(data)   
    df.columns = ['user', 'birth_date', 'sex', 'city', 'country', 'graduation', 'university', 'school']
    df = df[df['user'].map(lambda x: int(x) in users)]
    
    epoch = datetime.datetime.fromtimestamp(0)
    delta = datetime.timedelta(1)
    
    df['user'] = df['user'].map(lambda x: users[x])
    
    #df['birth_date'] = df['birth_date'].map(lambda x: (epoch + int(x) * delta) if x == x else np.nan)
    df['birth_year'] = df['birth_date'].map(lambda x: map_age(x))
    #df['birth_year'] = df[df['birth_date'].notnull()]['birth_date'].map(lambda x: x.year)
    df = df.set_index('user').sort_index()
    
    return df

In [11]:
print users
demography = read_demography('data', users)

{197888: 9, 40164416: 57, 1586562: 39, 8911083: 18, 4016004: 34, 1749654: 73, 5036169: 55, 11537965: 63, 1168272: 28, 9888600: 50, 228052500: 44, 476181: 33, 34075542: 32, 587799: 35, 1125273: 45, 145762159: 75, 114012190: 66, 222534854: 29, 299041: 60, 1578331: 62, 227364: 2, 132555462: 22, 971046: 7, 136128807: 21, 4476842: 59, 1800107: 15, 2428140: 74, 126226221: 20, 167515057: 11, 5357107: 38, 1302342: 31, 8345185: 64, 1005299: 53, 2782004: 36, 11401658: 41, 92661227: 71, 50931776: 4, 147962315: 65, 3403462: 6, 106098375: 16, 1680584: 40, 28431945: 54, 2801483: 30, 96905651: 70, 9434575: 72, 2175192: 69, 332243: 49, 5169109: 61, 4303961: 42, 253195991: 14, 120309336: 12, 84304932: 23, 161946971: 13, 13490396: 26, 3855711: 47, 1545744: 43, 793191: 68, 3898216: 27, 914538: 5, 46413419: 17, 835692: 46, 161381997: 10, 79387118: 3, 1371119: 52, 3800048: 56, 213161330: 58, 925043: 37, 10739700: 1, 520950: 8, 2110199: 24, 202150776: 67, 24308986: 0, 28822267: 48, 28924156: 51, 1419261: 19

In [12]:
def make_weighter(G):
    d = G.adj
    def weighter(edge):
        f,t = edge
        return d[f][t]['weight']   
    return weighter 

In [13]:
weighter = make_weighter(G)

In [14]:
def prepare_tests(num, prct, demography):
    tests = []
    for i in range(num):
        nodes_with_age = list(demography.index)
        random.shuffle(nodes_with_age)

        pivot = int(len(nodes_with_age) * prct)
        labeled_nodes = nodes_with_age[:pivot]
        test_nodes = nodes_with_age[pivot:]
        
        tests.append((labeled_nodes, test_nodes))
        
    return tests

tests = prepare_tests(10, 0.4, demography)

In [72]:
filter(lambda x: not np.isnan(x), [np.nan, 1, 2])

[1, 2]

In [131]:
import pulp

class LinearProgrammingSolver(object):
    def __init__(self, G, demography, labeled_nodes, 
                 weighter):
        self.G = G
        self.demography = demography
        self.labeled_nodes = labeled_nodes
        self.weighter = weighter
        
        years = sorted(set(demography.ix[labeled_nodes, 'birth_year']))
        
        years = filter(lambda x: not np.isnan(x), years)
        print years
        self.year_to_id = dict(zip(years, range(len(years))))
        self.id_to_year = dict(enumerate(years))
        
    def prepare_lp(self):
        G = self.G
        year_to_id = self.year_to_id  
        demography = self.demography
        
        labeled_nodes_set = set(self.labeled_nodes)
        
        num_labels = len(year_to_id)
        num_edges = G.number_of_edges()
        num_nodes = G.number_of_nodes()
                
        prob = pulp.LpProblem("Labelled graph problem", pulp.LpMinimize)    
        
        x_pa = pulp.LpVariable.dicts('x_pa', [(node, label) for node in range(num_nodes) 
                                            for label in range(num_labels)],                                  
                                  0., 1.)
        
        z_e = pulp.LpVariable.dicts('z_e', [edge for edge in range(num_edges)])
        z_ea = pulp.LpVariable.dicts('z_ea', [(edge, label) for edge in range(num_edges) 
                                                for label in range(num_labels)])
        
        condition = 0
        for node in G.nodes():
            node = users[node]
            #print labeled_nodes_set
            if node in labeled_nodes_set:
                year = demography.ix[node]['birth_year'] 
                if np.isnan(year):
                    condition += pulp.lpSum([x_pa[(node,label)] for label in range(num_labels)])
                else:
                    condition += pulp.lpSum([x_pa[(node, label)] * np.abs(year - self.id_to_year[label]) for label in range(num_labels)]) 
                                              
        
        for edge, (f, t, data) in enumerate(G.edges(data=True)):
            #w = data['weight']
            if (f in labeled_nodes_set) and (t in labeled_nodes_set):
                (year_f, year_t) = demography.ix[f]['birth_year'], demography.ix[t]['birth_year']
                diff = np.abs(year_f - year_t)
                condition += z_e[edge] * ( 10. if diff < 4 else np.max(1. - diff / 10., 0.2) )
            else:
                condition += z_e[edge] * self.weighter((f, t)) 
        
        prob += condition 
        for node in G.nodes():
            prob += (pulp.lpSum([x_pa[(users[node], label)] for label in range(num_labels)]) == 1.)
                
        for edge, (f, t) in enumerate(G.edges()):
            prob += (0.5 * pulp.lpSum([z_ea[(edge, label)] for label in range(num_labels)]) == z_e[edge])
            
            f = users[f]
            t = users[t]
            for label in range(num_labels):
                prob += (z_ea[(edge, label)] >= x_pa[(f, label)] - x_pa[(t, label)])
                prob += (z_ea[(edge, label)] >= x_pa[(t, label)] - x_pa[(f, label)])
        
        
        return prob 
    #prob.writeLP("problem.lp")

In [132]:
LPS = LinearProgrammingSolver(G, demography, list(demography.index), weighter)
#

[1986.0, 1953.0, 1957.0, 1970.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1987.0, 1988.0, 1989.0, 1991.0, 1996.0]


In [133]:
LP = LPS.prepare_lp()

In [134]:
LP.solve(pulp.solvers.COIN_CMD())
users_ = dict(zip(users.values(), users.keys()))

In [135]:
u_p = dd(lambda: 0)
u_y = {}
for v in LP.variables():
    val = pulp.value(v)
    if np.isnan(val):
        continue
    v = str(v)
    if v[0] != 'x':
        continue
    uid_, yid = v[6:-1].split(',')
    uid_ = users_[int(uid_)]
    yid = LPS.id_to_year[int(yid[1:])]
    
    if u_p[uid_] < val:
        u_p[uid_] = val
        u_y[uid_] = yid 

In [136]:
print u_p
print u_y


defaultdict(<function <lambda> at 0x7f710ea015f0>, {197888: 1.0, 40164416: 1.0, 1586562: 1.0, 8911083: 1.0, 4016004: 1.0, 1749654: 1.0, 5036169: 1.0, 11537965: 1.0, 1168272: 1.0, 9888600: 1.0, 228052500: 1.0, 476181: 1.0, 34075542: 1.0, 587799: 1.0, 1125273: 1.0, 145762159: 1.0, 114012190: 1.0, 299041: 1.0, 1578331: 1.0, 227364: 1.0, 222534854: 1.0, 971046: 1.0, 136128807: 1.0, 4476842: 1.0, 1800107: 1.0, 2428140: 1.0, 126226221: 1.0, 3403462: 1.0, 167515057: 1.0, 5357107: 1.0, 1302342: 1.0, 8345185: 1.0, 1005299: 1.0, 2782004: 1.0, 11401658: 1.0, 92661227: 1.0, 50931776: 1.0, 147962315: 1.0, 132555462: 1.0, 106098375: 1.0, 1680584: 1.0, 28431945: 1.0, 2801483: 1.0, 96905651: 1.0, 9434575: 1.0, 2175192: 1.0, 332243: 1.0, 5169109: 1.0, 4303961: 1.0, 253195991: 1.0, 120309336: 1.0, 84304932: 1.0, 161946971: 1.0, 13490396: 1.0, 3855711: 1.0, 1545744: 1.0, 793191: 1.0, 3898216: 1.0, 914538: 1.0, 46413419: 1.0, 835692: 1.0, 161381997: 1.0, 79387118: 1.0, 1371119: 1.0, 3800048: 1.0, 21316133

In [92]:
LP.coefficients

<bound method LpProblem.coefficients of Labelled graph problem:
MINIMIZE
0.839675707796*z_e_0 + 0.846493363221*z_e_1 + 0.848119640241*z_e_10 + 0.834967721897*z_e_100 + 0.849410456228*z_e_101 + 0.848766180021*z_e_102 + 0.848443193312*z_e_103 + 0.843031246659*z_e_104 + 0.842364742906*z_e_105 + 0.842615833176*z_e_106 + 0.839675707796*z_e_107 + 0.839675707796*z_e_108 + 0.840688419631*z_e_109 + 0.841024836145*z_e_11 + 0.841613065194*z_e_110 + 0.85406555855*z_e_111 + 0.847145578207*z_e_112 + 0.847145578207*z_e_113 + 0.847470833151*z_e_114 + 0.827077276561*z_e_115 + 0.847795520342*z_e_116 + 0.849088600837*z_e_117 + 0.848443193312*z_e_118 + 0.839541102776*z_e_119 + 0.847795520342*z_e_12 + 0.841024836145*z_e_120 + 0.839675707796*z_e_121 + 0.833925828537*z_e_122 + 0.841613065194*z_e_123 + 0.839675707796*z_e_124 + 0.848766180021*z_e_125 + 0.848119640241*z_e_126 + 0.840013855872*z_e_127 + 0.842030629263*z_e_128 + 0.841695940539*z_e_129 + 0.849731746667*z_e_13 + 0.842282152084*z_e_130 + 0.833577362