In [1]:
import csv
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds, eigs
from tqdm.notebook import tqdm

import networkx as nx
from collections import Counter

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
train_graph = nx.read_edgelist(
    'train_pos_after_eda.csv',
    delimiter=',',
    create_using=nx.DiGraph(),
    nodetype=int
)

print(f"Name: {train_graph.name}")
print(f"Type: {type(train_graph)}")
print(f"Number of nodes: {train_graph.number_of_nodes()}")
print(f"Number of edges: {train_graph.number_of_edges()}")

Name: 
Type: <class 'networkx.classes.digraph.DiGraph'>
Number of nodes: 1780722
Number of edges: 7550015


### Similarity Measures

#### Jaccard Distance

In [3]:
# for followees
def jaccard_for_followees(a,b):
  try:
    if len(set(train_graph.successors(a))) == 0 | len(set(train_graph.successors(b))) == 0:
      return 0
    sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/(len(set(train_graph.successors(a)).union(set(train_graph.successors(b)))))
  except:
    return 0
  return sim

In [4]:
# one test case
print(jaccard_for_followees(273084, 1505602))

0.0


In [5]:
# node 1635354 not in graph
print(jaccard_for_followees(273084, 1505602))

0.0


In [6]:
# for followers
def jaccard_for_followers(a,b):
  try:
    if len(set(train_graph.predecessors(a))) == 0 | len(set(train_graph.predecessors(b))) == 0:
     return 0
    sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/(len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b)))))
    return sim
  except:
    return 0

In [7]:
print(jaccard_for_followers(273084, 470294))

0.0


In [8]:
# node 1635354 not in graph
print(jaccard_for_followers(669354, 1635354))

0


#### Cosine Distance

In [9]:
import math
# for followees
def cosine_for_followees(a, b):
  try:
    if len(set(train_graph.successors(a))) == 0 | len(set(train_graph.successors(b))) == 0:
      return 0
    sim = (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/(math.sqrt(len(set(train_graph.successors(a)))*len(set(train_graph.successors(b)))))
    return sim
  except:
    return 0

In [10]:
print(cosine_for_followees(273084, 1505602))

0.0


In [11]:
print(cosine_for_followees(273084, 1635354))

0


In [12]:
def cosine_for_followers(a, b):
  try:
    if len(set(train_graph.predecessors(a))) == 0 | len(set(train_graph.predecessors(b))) == 0:
      return 0
    sim = (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/(math.sqrt(len(set(train_graph.predecessors(a))))*len(set(train_graph.predecessors(b))))
    return sim
  except:
    return 0

In [13]:
print(cosine_for_followers(2, 470294))

0.02886751345948129


In [14]:
print(cosine_for_followers(669354, 1635354))

0


### Page Ranking

In [15]:
pr = nx.pagerank(train_graph, alpha=0.85)
pickle.dump(pr, open('page_rank.pkl', 'wb'))

In [16]:
print('Min:',pr[min(pr, key=pr.get)])
print('Max:',pr[max(pr, key=pr.get)])
print('Mean:',float(sum(pr.values())) / len(pr))

Min: 1.6556497245737814e-07
Max: 2.709825134193587e-05
Mean: 5.615699699389075e-07


In [17]:
#for imputing to nodes which are not there in Train data
mean_pr = float(sum(pr.values())) / len(pr)
print(mean_pr)

5.615699699389075e-07


### Other Graph Features

#### Shortest Path

In [18]:
# if has direct edge then deleting that edge and calculating shortest path
def compute_shortest_path_length(a, b):
  p=-1
  try:
    if train_graph.has_edge(a,b):
      train_graph.remove_edge(a,b)
      p= nx.shortest_path_length(train_graph,source=a,target=b)
      train_graph.add_edge(a,b)
    else:
      p= nx.shortest_path_length(train_graph,source=a,target=b)
    return p
  except:
    return -1

In [19]:
# testing
compute_shortest_path_length(77697, 826021)

10

In [20]:
# testing
compute_shortest_path_length(669354, 1635354)

-1

#### Checking for same weakly connected component (community)

In [21]:
%%time
# getting weakly connected edges from graph
wcc = list(nx.weakly_connected_components(train_graph))

CPU times: user 13.9 s, sys: 170 ms, total: 14.1 s
Wall time: 14.1 s


In [22]:
def belong_to_same_wcc(a, b):
  index = []
  if train_graph.has_edge(b,a):
    return 1
  if train_graph.has_edge(a,b):
    for i in wcc:
      if a in i:
        index = i
        break
    if (b in index):
      train_graph.remove_edge(a,b)
      if compute_shortest_path_length(a, b) == -1:
        train_graph.add_edge(a,b)
        return 0
      else:
        train_graph.add_edge(a,b)
        return 1
    else:
      return 0

In [23]:
print(belong_to_same_wcc(861, 1659750))

0


In [24]:
print(belong_to_same_wcc(669354, 1635354))

None


#### Adamic/Adar Index

In [25]:
# adar index
def calc_adar_in(a, b):
  sum = 0
  try:
    n = list(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
    if len(n) != 0:
      for i in n:
        sum = sum + (1/np.log10(len(list(train_graph.predecessors(i)))))
      return sum
    else:
      return 0
  except:
    return 0

In [26]:
calc_adar_in(1, 189226)

0

In [27]:
calc_adar_in(669354, 1635354)

0

#### Is person following back

In [28]:
def follows_back(a, b):
  if train_graph.has_edge(b, a):
    return 1
  else:
    return 0

In [29]:
follows_back(1, 189226)

1

In [30]:
follows_back(669354, 1635354)

0

#### Katz Centrality

In [31]:
%%time
katz = nx.katz.katz_centrality(train_graph, alpha=0.005, beta=1)

CPU times: user 3min 56s, sys: 1.53 s, total: 3min 57s
Wall time: 4min 3s


In [32]:
pickle.dump(katz, open('katz.pkl', 'wb'))

In [33]:
print(f"Min: {katz[min(katz, key=katz.get)]}")
print(f"Max: {katz[max(katz, key=katz.get)]}")
print(f"Mean: {float(sum(katz.values())) / len(katz)}")

Min: 0.0007313532484055978
Max: 0.003394554981694509
Mean: 0.0007483800935553944


In [34]:
mean_katz = float(sum(katz.values())) / len(katz)
print(mean_katz)

0.0007483800935553944


#### Hit Score

In [35]:
%%time
hits = nx.hits(train_graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)

CPU times: user 1min 28s, sys: 2.53 s, total: 1min 31s
Wall time: 1min 25s


In [36]:
pickle.dump(hits, open('hits.pkl', 'wb'))

In [37]:
print(f"Min: {hits[0][min(hits[0], key=hits[0].get)]}")
print(f"Max: {hits[0][max(hits[0], key=hits[0].get)]}")
print(f"Mean: {float(sum(hits[0].values())) / len(hits[0])}")

Min: -5.759763531919088e-21
Max: 0.004868653379538981
Mean: 5.615699699308678e-07


### Featurization

#### Reading a sample of Data from both train and test

In [38]:
%%time
filename = 'train_after_eda.csv'

# n_train = sum([1 for line in open(filename)])
n_train = 15100028
s = 100000 #desired sample size
skip_train = sorted(random.sample(range(1, n_train+1), n_train-s))

CPU times: user 31.7 s, sys: 630 ms, total: 32.3 s
Wall time: 32.2 s


In [39]:
%%time
filename = 'test_after_eda.csv'

# n_test = sum([1 for line in open(filename)])
n_test = 3775006
s = 50000 #desired sample size
skip_test = sorted(random.sample(range(1, n_test+1), n_test-s))

CPU times: user 7.34 s, sys: 134 ms, total: 7.47 s
Wall time: 7.54 s


In [40]:
print("Number of rows in the tarin data:", n_train)
print("Number of rows we are going to eliminate in train data are:", len(skip_train))
print("Number of rows in the test data:", n_test)
print("Number of rows we are going to eliminate in test data are:", len(skip_test))

Number of rows in the tarin data: 15100028
Number of rows we are going to eliminate in train data are: 15000028
Number of rows in the test data: 3775006
Number of rows we are going to eliminate in test data are: 3725006


In [41]:
%%time
df_final_train = pd.read_csv('train_after_eda.csv', skiprows=skip_train, names=['source_node', 'destination_node'])
df_final_train['indicator_link'] = pd.read_csv('train_y.csv', skiprows=skip_train, names=['indicator_link'])
print("Our train matrix size ", df_final_train.shape)
df_final_train.head(2)

Our train matrix size  (100002, 3)
CPU times: user 9.04 s, sys: 2.33 s, total: 11.4 s
Wall time: 11.4 s


Unnamed: 0,source_node,destination_node,indicator_link
0,273084,1505602,1
1,1252341,1374700,1


In [42]:
%%time
df_final_test = pd.read_csv('test_after_eda.csv', skiprows=skip_test, names=['source_node', 'destination_node'])
df_final_test['indicator_link'] = pd.read_csv('test_y.csv', skiprows=skip_test, names=['indicator_link'])
print("Our test matrix size ", df_final_test.shape)
df_final_test.head(2)

Our test matrix size  (50002, 3)
CPU times: user 1.47 s, sys: 400 ms, total: 1.87 s
Wall time: 1.86 s


Unnamed: 0,source_node,destination_node,indicator_link
0,848424,784690,1
1,111353,1121258,1


#### Adding a set of features

In [43]:
def compute_features_stage1(df_final):
    #calculating no of followers followees for source and destination
    #calculating intersection of followers and followees for source and destination
    num_followers_s=[]
    num_followees_s=[]
    num_followers_d=[]
    num_followees_d=[]
    inter_followers=[]
    inter_followees=[]
    for i,row in df_final.iterrows():
        try:
            s1=set(train_graph.predecessors(row['source_node']))
            s2=set(train_graph.successors(row['source_node']))
        except:
            s1 = set()
            s2 = set()
        try:
            d1=set(train_graph.predecessors(row['destination_node']))
            d2=set(train_graph.successors(row['destination_node']))
        except:
            d1 = set()
            d2 = set()
        num_followers_s.append(len(s1))
        num_followees_s.append(len(s2))

        num_followers_d.append(len(d1))
        num_followees_d.append(len(d2))

        inter_followers.append(len(s1.intersection(d1)))
        inter_followees.append(len(s2.intersection(d2)))

    return num_followers_s, num_followers_d, num_followees_s, num_followees_d, inter_followers, inter_followees

In [44]:
%%time
print("Computing features for train data...")
df_final_train['num_followers_s'], df_final_train['num_followers_d'], \
df_final_train['num_followees_s'], df_final_train['num_followees_d'], \
df_final_train['inter_followers'], df_final_train['inter_followees']= compute_features_stage1(df_final_train)

print("Computing features for test data...")
df_final_test['num_followers_s'], df_final_test['num_followers_d'], \
df_final_test['num_followees_s'], df_final_test['num_followees_d'], \
df_final_test['inter_followers'], df_final_test['inter_followees']= compute_features_stage1(df_final_test)

Computing features for train data...
Computing features for test data...
CPU times: user 10.6 s, sys: 28.4 ms, total: 10.6 s
Wall time: 10.7 s


In [45]:
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees
0,273084,1505602,1,11,6,15,8,0,0
1,1252341,1374700,1,10,1,14,3,0,1
2,984700,1048062,1,6,29,11,52,0,0
3,1788309,1813213,1,26,10,31,0,0,0
4,979020,485390,1,10,1,14,0,0,0


In [46]:
df_final_test.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees
0,848424,784690,1,6,14,6,9,1,0
1,111353,1121258,1,7,0,12,1,0,0
2,1327403,844838,1,2,6,1,9,0,0
3,380260,917228,1,0,4,0,6,0,0
4,784351,1395044,1,8,2,8,4,0,1


#### Adding new set of features

In [47]:
#mapping adar index on train
df_final_train['adar_index'] = df_final_train.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)
#mapping adar index on test
df_final_test['adar_index'] = df_final_test.apply(lambda row: calc_adar_in(row['source_node'],row['destination_node']),axis=1)

#--------------------------------------------------------------------------------------------------------
#mapping followback or not on train
df_final_train['follows_back'] = df_final_train.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)

#mapping followback or not on test
df_final_test['follows_back'] = df_final_test.apply(lambda row: follows_back(row['source_node'],row['destination_node']),axis=1)

#--------------------------------------------------------------------------------------------------------
#mapping same component of wcc or not on train
df_final_train['same_comp'] = df_final_train.apply(lambda row: belong_to_same_wcc(row['source_node'],row['destination_node']),axis=1)

##mapping same component of wcc or not on train
df_final_test['same_comp'] = df_final_test.apply(lambda row: belong_to_same_wcc(row['source_node'],row['destination_node']),axis=1)

#--------------------------------------------------------------------------------------------------------
#mapping shortest path on train
df_final_train['shortest_path'] = df_final_train.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)
#mapping shortest path on test
df_final_test['shortest_path'] = df_final_test.apply(lambda row: compute_shortest_path_length(row['source_node'],row['destination_node']),axis=1)

#### Adding new set of features

##### Weight Features

In [48]:
#weight for source and destination of each link
Weight_in = {}
Weight_out = {}
for i in  tqdm(train_graph.nodes()):
    s1=set(train_graph.predecessors(i))
    w_in = 1.0/(np.sqrt(1+len(s1)))
    Weight_in[i]=w_in

    s2=set(train_graph.successors(i))
    w_out = 1.0/(np.sqrt(1+len(s2)))
    Weight_out[i]=w_out

#for imputing with mean
mean_weight_in = np.mean(list(Weight_in.values()))
mean_weight_out = np.mean(list(Weight_out.values()))

  0%|          | 0/1780722 [00:00<?, ?it/s]

In [49]:
#mapping to pandas train
df_final_train['weight_in'] = df_final_train.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_train['weight_out'] = df_final_train.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))

#mapping to pandas test
df_final_test['weight_in'] = df_final_test.destination_node.apply(lambda x: Weight_in.get(x,mean_weight_in))
df_final_test['weight_out'] = df_final_test.source_node.apply(lambda x: Weight_out.get(x,mean_weight_out))


#some features engineerings on the in and out weights
df_final_train['weight_f1'] = df_final_train.weight_in + df_final_train.weight_out
df_final_train['weight_f2'] = df_final_train.weight_in * df_final_train.weight_out
df_final_train['weight_f3'] = (2*df_final_train.weight_in + 1*df_final_train.weight_out)
df_final_train['weight_f4'] = (1*df_final_train.weight_in + 2*df_final_train.weight_out)

#some features engineerings on the in and out weights
df_final_test['weight_f1'] = df_final_test.weight_in + df_final_test.weight_out
df_final_test['weight_f2'] = df_final_test.weight_in * df_final_test.weight_out
df_final_test['weight_f3'] = (2*df_final_test.weight_in + 1*df_final_test.weight_out)
df_final_test['weight_f4'] = (1*df_final_test.weight_in + 2*df_final_test.weight_out)

In [50]:
%%time

#page rank for source and destination in Train and Test
#if anything not there in train graph then adding mean page rank
df_final_train['page_rank_s'] = df_final_train.source_node.apply(lambda x:pr.get(x,mean_pr))
df_final_train['page_rank_d'] = df_final_train.destination_node.apply(lambda x:pr.get(x,mean_pr))

df_final_test['page_rank_s'] = df_final_test.source_node.apply(lambda x:pr.get(x,mean_pr))
df_final_test['page_rank_d'] = df_final_test.destination_node.apply(lambda x:pr.get(x,mean_pr))
#================================================================================

#Katz centrality score for source and destination in Train and test
#if anything not there in train graph then adding mean katz score
df_final_train['katz_s'] = df_final_train.source_node.apply(lambda x: katz.get(x,mean_katz))
df_final_train['katz_d'] = df_final_train.destination_node.apply(lambda x: katz.get(x,mean_katz))

df_final_test['katz_s'] = df_final_test.source_node.apply(lambda x: katz.get(x,mean_katz))
df_final_test['katz_d'] = df_final_test.destination_node.apply(lambda x: katz.get(x,mean_katz))
#================================================================================

#Hits algorithm score for source and destination in Train and test
#if anything not there in train graph then adding 0
df_final_train['hubs_s'] = df_final_train.source_node.apply(lambda x: hits[0].get(x,0))
df_final_train['hubs_d'] = df_final_train.destination_node.apply(lambda x: hits[0].get(x,0))

df_final_test['hubs_s'] = df_final_test.source_node.apply(lambda x: hits[0].get(x,0))
df_final_test['hubs_d'] = df_final_test.destination_node.apply(lambda x: hits[0].get(x,0))
#================================================================================

#Hits algorithm score for source and destination in Train and Test
#if anything not there in train graph then adding 0
df_final_train['authorities_s'] = df_final_train.source_node.apply(lambda x: hits[1].get(x,0))
df_final_train['authorities_d'] = df_final_train.destination_node.apply(lambda x: hits[1].get(x,0))

df_final_test['authorities_s'] = df_final_test.source_node.apply(lambda x: hits[1].get(x,0))
df_final_test['authorities_d'] = df_final_test.destination_node.apply(lambda x: hits[1].get(x,0))
#================================================================================

CPU times: user 1.31 s, sys: 7.9 ms, total: 1.32 s
Wall time: 1.32 s


#### SVD feature for both source and destination

In [51]:
def svd(x, S):
    try:
        z = sadj_dict[x]
        return S[z]
    except:
        return [0,0,0,0,0,0]

In [52]:
%%time

#for svd features to get feature vector creating a dict node val and inedx in svd vector
sadj_col = sorted(train_graph.nodes())
sadj_dict = { val:idx for idx,val in enumerate(sadj_col)}

CPU times: user 2.79 s, sys: 152 ms, total: 2.94 s
Wall time: 2.99 s


In [53]:
%%time

Adj = nx.adjacency_matrix(train_graph,nodelist=sorted(train_graph.nodes())).asfptype()

CPU times: user 52.4 s, sys: 1.63 s, total: 54 s
Wall time: 53.9 s


In [54]:
%%time

U, s, V = svds(Adj, k = 6)
print('Adjacency matrix Shape',Adj.shape)
print('U Shape',U.shape)
print('V Shape',V.shape)
print('s Shape',s.shape)

Adjacency matrix Shape (1780722, 1780722)
U Shape (1780722, 6)
V Shape (6, 1780722)
s Shape (6,)
CPU times: user 44.8 s, sys: 1.27 s, total: 46 s
Wall time: 32 s


In [55]:
%%time

#===================================================================================================

df_final_train[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_train.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)

df_final_train[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)
#===================================================================================================

df_final_train[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_train.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

df_final_train[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_train.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
#===================================================================================================

df_final_test[['svd_u_s_1', 'svd_u_s_2','svd_u_s_3', 'svd_u_s_4', 'svd_u_s_5', 'svd_u_s_6']] = \
df_final_test.source_node.apply(lambda x: svd(x, U)).apply(pd.Series)

df_final_test[['svd_u_d_1', 'svd_u_d_2', 'svd_u_d_3', 'svd_u_d_4', 'svd_u_d_5','svd_u_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, U)).apply(pd.Series)

#===================================================================================================

df_final_test[['svd_v_s_1','svd_v_s_2', 'svd_v_s_3', 'svd_v_s_4', 'svd_v_s_5', 'svd_v_s_6',]] = \
df_final_test.source_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)

df_final_test[['svd_v_d_1', 'svd_v_d_2', 'svd_v_d_3', 'svd_v_d_4', 'svd_v_d_5','svd_v_d_6']] = \
df_final_test.destination_node.apply(lambda x: svd(x, V.T)).apply(pd.Series)
#===================================================================================================

CPU times: user 1min 54s, sys: 2.45 s, total: 1min 56s
Wall time: 1min 57s


In [56]:
df_final_train.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,adar_index,...,svd_v_s_3,svd_v_s_4,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6
0,273084,1505602,1,11,6,15,8,0,0,0.0,...,1.983699e-06,-1.545065e-13,-8.108263e-13,-1.719688e-14,-1.355367e-12,4.675285e-13,1.128588e-06,-6.61665e-14,-9.771062e-13,-4.159993e-14
1,1252341,1374700,1,10,1,14,3,0,1,0.926628,...,4.918216e-14,-4.631206e-15,-3.155779e-15,-1.395068e-16,-1.712474e-19,-9.345474e-21,-3.886368e-19,-1.061186e-19,3.102919e-20,-2.556581e-19
2,984700,1048062,1,6,29,11,52,0,0,0.0,...,4.023688e-12,-4.633236e-14,-2.997645e-11,-4.002844e-16,-1.298131e-12,9.508589e-09,5.260981e-12,-6.742226e-12,-1.70957e-13,-6.255724e-15
3,1788309,1813213,1,26,10,31,0,0,0,0.0,...,2.390787e-05,-6.561545e-12,-9.588017e-12,-7.499915e-13,-5.863976e-13,8.9926e-14,1.104945e-05,-3.036148e-13,-5.105628e-13,-1.197388e-13
4,979020,485390,1,10,1,14,0,0,0,0.0,...,2.983898e-12,-1.476051e-12,-1.410034e-09,-2.568708e-14,3.084229e-20,1.845237e-20,1.0100809999999999e-19,3.804134e-20,-1.9892369999999998e-20,8.285579e-20


In [57]:
df_final_train.shape

(100002, 51)

In [58]:
df_final_test.head()

Unnamed: 0,source_node,destination_node,indicator_link,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,adar_index,...,svd_v_s_3,svd_v_s_4,svd_v_s_5,svd_v_s_6,svd_v_d_1,svd_v_d_2,svd_v_d_3,svd_v_d_4,svd_v_d_5,svd_v_d_6
0,848424,784690,1,6,14,6,9,1,0,0.0,...,5.904477e-11,-2.701538e-12,-4.341597e-13,-5.535509e-14,-9.994072e-10,5.791899e-10,3.512025e-07,-2.486659e-09,-2.77112e-09,-1.727661e-12
1,111353,1121258,1,7,0,12,1,0,0,0.0,...,7.590512e-13,-4.724448e-14,-7.165399e-13,-5.845764e-16,1.334453e-19,7.190125999999999e-20,3.7277549999999997e-19,8.189645e-20,-7.358945999999999e-20,2.3599599999999997e-19
2,1327403,844838,1,2,6,1,9,0,0,0.0,...,1.269145e-06,-1.447856e-13,-1.285281e-12,-1.074914e-13,-5.376066e-13,1.166558e-13,4.390138e-07,-1.650798e-14,-3.090519e-13,-1.381274e-14
3,380260,917228,1,0,4,0,6,0,0,0.0,...,0.0,0.0,0.0,0.0,-3.259316e-12,2.872989e-11,1.023766e-10,-2.121218e-12,-8.538985e-10,-1.513135e-12
4,784351,1395044,1,8,2,8,4,0,1,1.430677,...,5.610629e-13,-3.103763e-15,-1.629291e-15,-1.724461e-12,-1.0538160000000001e-17,7.350274e-18,4.551412e-15,-1.307975e-18,-2.8607740000000002e-18,-6.483176e-20


In [59]:
df_final_test.shape

(50002, 51)

In [60]:
%%time
df_final_train.to_csv("final_train_df.csv", index=False)
df_final_test.to_csv("final_test_df.csv", index=False)

CPU times: user 13.6 s, sys: 326 ms, total: 13.9 s
Wall time: 14 s
