In [1]:
from keras.models import load_model
from keras import regularizers
import pandas as pd
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split
import numba as nb


Using TensorFlow backend.


In [2]:
class read_train_model():
    
    def __init__(self,data_path):
        self.data_path=data_path
        self.df_in=pd.DataFrame()
        self.df_train=pd.DataFrame()
        self.df_test=pd.DataFrame()
        self.n_user=0
        self.n_posts=0
        
    def read_data(self):
        self.df_in = pd.read_csv(self.data_path)
        self.df_in.loc[self.df_in.Likes>0,'Likes']=1
        self.df_in.loc[self.df_in.Comments>0,'Comments']=1
        self.df_in.loc[self.df_in.Shares>0,'Shares']=1
        self.df_in.loc[self.df_in.Downloads>0,'Downloads']=1
        self.df_in.loc[self.df_in.Views>0,'Views']=1

        self.df_in['Rating']=self.df_in['Likes']+self.df_in['Comments']+self.df_in['Shares']+self.df_in['Downloads']+self.df_in['Views']
        self.df_in.drop(['Likes','Comments','Shares','Downloads','Views'],axis=1,inplace=True)
    
        self.df_in.UserId = self.df_in.UserId.astype('category').cat.codes.values
        self.df_in.PostId = self.df_in.PostId.astype('category').cat.codes.values

        self.df_train,self.df_test =train_test_split(self.df_in, test_size = 0.1,random_state = 42 )

        self.n_users = len(self.df_in.UserId.unique()) 
        self.n_posts = len(self.df_in.PostId.unique())
        
    
    
    def user_post_ids(self):
        dff = pd.read_csv(self.data_path)
        dff.loc[dff.Likes>0,'Likes']=1
        dff.loc[dff.Comments>0,'Comments']=1
        dff.loc[dff.Shares>0,'Shares']=1
        dff.loc[dff.Downloads>0,'Downloads']=1
        dff.loc[dff.Views>0,'Views']=1

        dff['Rating']=dff['Likes']+dff['Comments']+dff['Shares']+dff['Downloads']+dff['Views']
        dff.drop(['Likes','Comments','Shares','Downloads','Views'],axis=1,inplace=True)
        trainn,testt = train_test_split(dff,test_size = 0.1,random_state =42)
        
        return trainn, testt


    def define_model(self):
        post_input = Input(shape=[1], name="post-Input")
        post_embedding = Embedding(self.n_posts+1,10,  name="post-Embedding")(post_input)
        lp = Dense(10,activation = 'relu',kernel_regularizer=regularizers.l2(0.001),)(post_embedding)
        Dropout(0.4)
        post_vec = Flatten(name="Flatten-post")(lp)

        user_input = Input(shape=[1], name="User-Input")
        user_embedding = Embedding(self.n_users+1, 10, name="User-Embedding")(user_input)
        l2 = Dense(10,activation = 'relu',kernel_regularizer=regularizers.l2(0.001))(user_embedding)
        Dropout(0.4)
        user_vec = Flatten(name="Flatten-Users")(l2)

        product_layer = Dot(name="Dot-Product", axes=1)([post_vec, user_vec])

        fully_connected_layer = Dense(10,activation ='relu')(product_layer)
        fully_connected_layer_2 = Dense(10,activation ='relu')(fully_connected_layer)
        fully_connected_layer_3 = Dense(10,activation ='relu')(fully_connected_layer_2)
        fully_connected_layer_4 = Dense(10,activation ='relu')(fully_connected_layer_3)


        output_connected_layer = Dense(1,activation ='linear')(fully_connected_layer_4)

        model = Model([user_input, post_input],output_connected_layer)
        model.compile(loss='mse', optimizer='adam', metrics=["mae"])
        return model
    
    def train_model(self):
        model =self.define_model()
        history = model.fit([self.df_train.UserId, self.df_train.PostId], self.df_train.Rating,validation_split=0.1 , epochs= 1, verbose=1)
        model.save('recommender_model.h5')
        return history
    
    def get_model(self):
        model = load_model('recommender_model.h5')
        print('model loaded')
        return model
    
    
    def get_estimation_data(self):
        def duplicate(testList,n ): 
            return list(testList*n)
                
        n_users,n_posts,train,test=self.n_user,self.n_posts,self.df_train,self.df_test
        trainn,testt=self.user_post_ids()
        len_post = len(test.PostId.unique())
        len_user= len(testt.UserId.unique())
        p = test.PostId.unique()
        unique_postids = p.tolist()
        upids=duplicate(unique_postids,len_user) #post_ids_looped


        u =test.UserId.unique()
        unique_userids =u.tolist()
        un = np.array(unique_userids)
        user_loop =np.repeat(unique_userids,len_post) #user_ids_looped
        ttpids = testt['PostId'].unique()
        ttuid = testt['UserId'].unique()
        pp = testt.PostId.unique()
        uunique_postids = pp.tolist()
        uupids=duplicate(uunique_postids,len_user) #post_ids_looped


        uu =testt.UserId.unique()
        uunique_userids =uu.tolist()
        uun = np.array(uunique_userids)
        uuser_loop =np.repeat(uunique_userids,len_post) #user_ids_looped
        post_data = np.array(upids)
        user = np.array(user_loop)
        model=self.get_model()
        estimations = model.predict([user, post_data]) #predictions
   
        pid =pd.DataFrame(uupids)  #forming dataframes
        uid =pd.DataFrame(uuser_loop)
        estimation =pd.DataFrame(estimations)
        dataa = pd.merge(estimation,pid,left_index =True,right_index = True)
        data = pd.merge(dataa,uid,left_index = True, right_index= True)
        data.rename(columns={'0_x':'estimation','0_y':'post_id',0:'user_id'},inplace = True)
        final_data_sorted = data.groupby(["user_id"]).apply(lambda x: x.sort_values(["estimation"], ascending = False)).reset_index(drop=True)
        return final_data_sorted



In [3]:
rec_model=read_train_model('/home/gaurav/Desktop/RecEng/Wall_Activity_User_Post.csv')
rec_model.read_data()
rec_model.define_model()
rec_model.train_model()


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 1108501 samples, validate on 123167 samples
Epoch 1/1


<keras.callbacks.History at 0x7f1a9549bf98>

In [None]:
import numba as nb
posts=rec_model.df_in['PostId'].unique()
users=rec_model.df_in['UserId'].unique()
final_df=pd.DataFrame(index=users)
model=rec_model.get_model()


i=1

for pid in posts:
    rec_posts_uid=[]
    post=np.full(shape=len(users),fill_value=pid)
    est=model.predict([users,post])
    final_df[pid]=est
    print(str(i)+':'+str(pid),end=',')
    i+=1


In [5]:
import time
posts=rec_model.df_in['PostId'].unique()
users=rec_model.df_in['UserId'].unique()
final_df=pd.DataFrame(index=users)
model=rec_model.get_model()


i=1
start = time.time()
for pid in posts:
    rec_posts_uid=[]
    post=np.full(shape=len(users),fill_value=pid)
    est=model.predict([users,post])
    final_df[pid]=est
    print(str(i)+':'+str(pid),end=',')
    i+=1
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))

model loaded


1:0,2:1,3:3,4:4,5:5,6:6,7:7,8:8,9:9,10:10,11:11,12:12,13:13,14:14,15:15,16:16,17:17,18:18,19:19,20:20,21:21,22:22,23:23,24:24,25:25,26:26,27:27,28:28,29:29,30:30,31:31,32:32,33:33,34:34,35:35,36:36,37:37,38:38,39:39,40:40,41:41,42:42,43:43,44:44,45:45,46:46,47:47,48:48,49:49,50:50,51:51,52:52,53:53,54:54,55:55,56:56,57:57,58:58,59:59,60:60,61:61,62:62,63:63,64:64,65:65,66:66,67:67,68:69,69:70,70:71,71:72,72:73,73:74,74:75,75:76,76:77,77:79,78:80,79:81,80:82,81:83,82:84,83:85,84:86,85:87,86:88,87:89,88:90,89:91,90:92,91:93,92:94,93:95,94:96,95:97,96:98,97:100,98:101,99:102,100:103,101:104,102:105,103:106,104:107,105:108,106:109,107:110,108:111,109:112,110:113,111:114,112:115,113:116,114:117,115:118,116:119,117:120,118:121,119:123,120:124,121:125,122:126,123:127,124:128,125:129,126:130,127:131,128:132,129:133,130:134,131:135,132:136,133:137,134:138,135:139,136:140,137:141,138:142,139:143,140:144,141:145,142:146,143:147,144:148,145:149,146:150,147:151,148:152,149:153,150:154,151:155,152:1

1040:1058,1041:1059,1042:1060,1043:1061,1044:1062,1045:1063,1046:1064,1047:1065,1048:1066,1049:1067,1050:1068,1051:1069,1052:1070,1053:1071,1054:1072,1055:1073,1056:1074,1057:1075,1058:1076,1059:1077,1060:1078,1061:1079,1062:1080,1063:1081,1064:1082,1065:1083,1066:1084,1067:1085,1068:1086,1069:1087,1070:1088,1071:1089,1072:1090,1073:1091,1074:1092,1075:1093,1076:1094,1077:1095,1078:1096,1079:1097,1080:1098,1081:1099,1082:1100,1083:1101,1084:1102,1085:1103,1086:1104,1087:1105,1088:1106,1089:1107,1090:1108,1091:1109,1092:1110,1093:1111,1094:1112,1095:1113,1096:1114,1097:1115,1098:1116,1099:1117,1100:1118,1101:1119,1102:1120,1103:1121,1104:1122,1105:1123,1106:1124,1107:1125,1108:1126,1109:1127,1110:1128,1111:1129,1112:1130,1113:1131,1114:1132,1115:1133,1116:1134,1117:1135,1118:1136,1119:1137,1120:1138,1121:1139,1122:1140,1123:1141,1124:1142,1125:1143,1126:1144,1127:1145,1128:1146,1129:1147,1130:1148,1131:1149,1132:1150,1133:1151,1134:1152,1135:1153,1136:1154,1137:1155,1138:1156,1139:1157,

1860:1903,1861:1904,1862:1905,1863:1906,1864:1907,1865:1908,1866:1909,1867:1910,1868:1911,1869:1912,1870:1913,1871:1915,1872:1916,1873:1917,1874:1918,1875:1919,1876:1920,1877:1921,1878:1922,1879:1923,1880:1924,1881:1925,1882:1926,1883:1927,1884:1928,1885:1929,1886:1930,1887:1931,1888:1932,1889:1934,1890:1935,1891:1936,1892:1937,1893:1938,1894:1939,1895:1940,1896:1941,1897:1942,1898:1943,1899:1944,1900:1945,1901:1946,1902:1947,1903:1948,1904:1949,1905:1950,1906:1951,1907:1953,1908:1954,1909:1955,1910:1956,1911:1957,1912:1958,1913:1959,1914:1960,1915:1961,1916:1962,1917:1963,1918:1964,1919:1965,1920:1966,1921:1967,1922:1969,1923:1970,1924:1971,1925:1972,1926:1973,1927:1974,1928:1975,1929:1976,1930:1977,1931:1978,1932:1979,1933:1980,1934:1981,1935:1982,1936:1983,1937:1984,1938:1985,1939:1986,1940:1987,1941:1988,1942:1989,1943:1990,1944:1991,1945:1992,1946:1993,1947:1994,1948:1995,1949:1996,1950:1997,1951:1998,1952:1999,1953:2000,1954:2001,1955:2002,1956:2003,1957:2004,1958:2005,1959:2006,

2680:2789,2681:2790,2682:2791,2683:2792,2684:2793,2685:2794,2686:2795,2687:2796,2688:2797,2689:2798,2690:2799,2691:2800,2692:2801,2693:2802,2694:2803,2695:2804,2696:2805,2697:2806,2698:2807,2699:2808,2700:2809,2701:2810,2702:2811,2703:2812,2704:2813,2705:2814,2706:2815,2707:2816,2708:2817,2709:2818,2710:2819,2711:2820,2712:2821,2713:2822,2714:2823,2715:2824,2716:2825,2717:2826,2718:2827,2719:2828,2720:2829,2721:2830,2722:2831,2723:2832,2724:2833,2725:2834,2726:2835,2727:2836,2728:2837,2729:2838,2730:2839,2731:2840,2732:2841,2733:2842,2734:2843,2735:2844,2736:2845,2737:2846,2738:2847,2739:2848,2740:2849,2741:2850,2742:2851,2743:2852,2744:2853,2745:2854,2746:2855,2747:2856,2748:2857,2749:2858,2750:2859,2751:2860,2752:2861,2753:2862,2754:2863,2755:2864,2756:2866,2757:2867,2758:2868,2759:2869,2760:2870,2761:2871,2762:2872,2763:2873,2764:2874,2765:2875,2766:2876,2767:2877,2768:2878,2769:2879,2770:2880,2771:2881,2772:2882,2773:2883,2774:2884,2775:2885,2776:2886,2777:2887,2778:2888,2779:2889,

In [9]:
final_df.describe()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,...,3223,3226,3251,3252,3257,3259,3297,3309,3308,3328
count,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,...,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0,80092.0
mean,1.07437,1.178028,1.075224,1.082639,1.185799,1.054346,1.087777,1.126271,1.126796,1.151361,...,1.235722,1.215124,1.164998,1.210075,1.208346,1.151351,1.192119,1.20346,1.151222,1.232358
std,0.085014,0.11866,0.085801,0.090504,0.119329,0.075623,0.091017,0.101214,0.10106,0.11351,...,0.13638,0.13308,0.116582,0.130289,0.128865,0.109636,0.125004,0.126989,0.111393,0.1368
min,1.008749,1.021401,1.008749,1.008749,1.023054,1.008749,1.008749,1.014768,1.014975,1.015777,...,1.029287,1.024475,1.018564,1.024446,1.02452,1.017983,1.022009,1.024376,1.01708,1.027434
25%,1.032945,1.105087,1.033258,1.0366,1.111498,1.022383,1.040345,1.069426,1.070015,1.083898,...,1.142728,1.127348,1.094561,1.124986,1.124417,1.086994,1.112975,1.121677,1.085635,1.139538
50%,1.056376,1.152988,1.057041,1.063671,1.162179,1.037506,1.070073,1.10794,1.108519,1.129071,...,1.215718,1.192808,1.141267,1.187746,1.186272,1.130249,1.167899,1.180966,1.129703,1.211841
75%,1.096149,1.234817,1.097358,1.10794,1.244512,1.067256,1.114615,1.161188,1.161823,1.200212,...,1.310449,1.286153,1.218242,1.278794,1.276099,1.197614,1.255231,1.269113,1.198371,1.307267
max,2.922579,3.126275,2.935046,2.986986,3.111602,2.833349,2.970949,3.013818,3.009374,3.110916,...,3.232468,3.220506,3.126003,3.196775,3.179592,3.067402,3.168722,3.175621,3.094526,3.237162


In [8]:
final_df.to_csv('/home/gaurav/Desktop/RecEng/final.csv')

In [46]:
posts=rec_model.df_in['PostId'].unique()
users=rec_model.df_in['UserId'].unique()
final_df=pd.DataFrame(index=users)
model=rec_model.get_model()
splits=len(posts)//4+1
posts_split_array=np.array_split(posts,splits)


i=1
start = time.time()
for splits in posts_split_array:
    posts_array=np.empty([0,1])
    for pid in splits:
        rec_posts_uid=[]
        post_one_user=np.full(shape=len(users),fill_value=pid)
        posts_array=np.append(posts_array,post_one_user)
    est=model.predict([users,posts_array])
    final_df[pid]=est
    print(str(i)+':'+str(pid),end=',')
    i+=1
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))

model loaded
1:4,2:8,3:12,4:16,5:20,6:24,7:28,8:32,9:36,10:40,11:44,

KeyboardInterrupt: 

In [47]:
final_df

Unnamed: 0,4,8,12,16,20,24,28,32,36,40,44
49,1.058512,1.167418,1.111606,1.114046,1.184893,1.293424,1.096650,1.141460,1.137356,1.146810,1.138551
85,1.076961,1.206286,1.135601,1.139039,1.226323,1.344079,1.119638,1.174951,1.168152,1.181774,1.169245
8687,1.017772,1.064865,1.037889,1.037462,1.070304,1.125069,1.030047,1.048681,1.047737,1.054665,1.050162
13505,1.040280,1.128619,1.083355,1.084631,1.139083,1.231679,1.069888,1.107636,1.104648,1.113417,1.106381
19386,1.011353,1.045094,1.025972,1.025183,1.048301,1.097060,1.019903,1.033205,1.032945,1.038128,1.035116
34746,2.111150,2.247125,2.173651,2.192140,2.280661,2.393679,2.166864,2.237280,2.221561,2.244598,2.216240
48204,1.026252,1.090504,1.054884,1.055121,1.098006,1.161676,1.044506,1.072487,1.070675,1.078275,1.072773
51645,1.124928,1.303012,1.210979,1.218950,1.329548,1.469238,1.186602,1.269650,1.259734,1.275421,1.259471
45,1.106815,1.265514,1.179689,1.186412,1.289256,1.420065,1.156593,1.233227,1.224506,1.239269,1.224698
54351,1.054578,1.159350,1.106454,1.108671,1.176306,1.283934,1.091597,1.135466,1.131543,1.141024,1.132902


In [44]:
a=np.empty([0,1])
a

array([], shape=(0, 1), dtype=float64)

In [45]:
a=np.append(a,[1,1,2])
a

array([1., 1., 2.])

In [35]:
post_1

[array([0, 1, 3, 4, 5, 6, 7, 8, 9]),
 array([10, 11, 12, 13, 14, 15, 16, 17, 18]),
 array([19, 20, 21, 22, 23, 24, 25, 26, 27]),
 array([28, 29, 30, 31, 32, 33, 34, 35, 36]),
 array([37, 38, 39, 40, 41, 42, 43, 44, 45]),
 array([46, 47, 48, 49, 50, 51, 52, 53, 54]),
 array([55, 56, 57, 58, 59, 60, 61, 62, 63]),
 array([64, 65, 66, 67, 69, 70, 71, 72, 73]),
 array([74, 75, 76, 77, 79, 80, 81, 82, 83]),
 array([84, 85, 86, 87, 88, 89, 90, 91, 92]),
 array([ 93,  94,  95,  96,  97,  98, 100, 101, 102]),
 array([103, 104, 105, 106, 107, 108, 109, 110, 111]),
 array([112, 113, 114, 115, 116, 117, 118, 119, 120]),
 array([121, 123, 124, 125, 126, 127, 128, 129, 130]),
 array([131, 132, 133, 134, 135, 136, 137, 138, 139]),
 array([140, 141, 142, 143, 144, 145, 146, 147, 148]),
 array([149, 150, 151, 152, 153, 154, 155, 156, 157]),
 array([158, 159, 160, 161, 162, 163, 164, 165, 166]),
 array([167, 168, 169, 170, 171, 172, 173, 174, 175]),
 array([176, 177, 178, 179, 180, 181, 182, 183, 184]),

In [29]:
from joblib import Parallel, delayed

posts=rec_model.df_in['PostId'].unique()

def make_final_df(pos):
    users=rec_model.df_in['UserId'].unique()
    final_df=pd.DataFrame(index=users)
    model=rec_model.get_model()
    i=1
    cdef int pid
    for pid in range(posts.shape[0]):
        rec_posts_uid=[]
        post=np.full(shape=len(users),fill_value=pid)
        est=model.predict([users,post])
        final_df[pid]=est
        print(str(i)+':'+str(pid),end=',')
        i+=1
    return final_df

        


SyntaxError: invalid syntax (<ipython-input-29-65199b75b547>, line 12)

In [20]:
results

NameError: name 'results' is not defined

In [13]:
from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    p = Pool(5)
    d = range(10000)
    start = time.time()
    print(p.map(f, d))
    

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801, 10000, 10201, 10404, 10609, 10816, 11025, 11236, 11449, 11664, 11881, 12100, 12321, 12544, 12769, 12996, 13225, 13456, 13689, 13924, 14161, 14400, 14641, 14884, 15129, 15376, 15625, 15876, 16129, 16384, 16641, 16900, 17161, 17424, 17689, 17956, 18225, 18496, 18769, 19044, 19321, 19600, 19881, 20164, 20449, 20736, 21025, 21316, 21609, 21904, 22201, 22500, 22801, 23104, 23409, 23716, 24025, 24336, 24649, 24964, 25281, 25600, 25921, 26244, 2656