In [1]:
# Find Social Circle Connections
# Enter your record number below to view matching connections

In [2]:
connect_my_record = 8

In [3]:
import pandas_profiling as pp
import numpy as np
import pandas as pd
import io
import requests
import sklearn.metrics as sm
from sklearn import datasets
from sklearn.cluster import KMeans
import warnings

# Ignore Jupyter Notebook python warnings
warnings.simplefilter('ignore')

# Read .csv from URL
url="https://raw.githubusercontent.com/daybreaksocialcircles/groupcluster/master/social_circles.csv"
df=pd.read_csv(url)

# Alternate Array formats for different analysis
url_source=requests.get(url).content 
hdf=pd.read_csv(io.StringIO(url_source.decode('utf-8')), header=None)
sc_array = np.array(hdf)

# Store the original column names in a python list
original_headers = list(df.columns.values)

# Remove all but the numeric columns
# Some analysis can only be performed on numeric data
ndf = df._get_numeric_data()

# Store the headers for all numeric columns in a list
numeric_headers = list(ndf.columns.values)

# Change NaN values to 0
NaNs_values = np.isnan(ndf)
ndf[NaNs_values] = 0
sc_matrix = pd.DataFrame.as_matrix(ndf)


def matches(connection1, connection2):
    print(sc_array[connection2][0])
    merged_list = [[sc_array[0][i], sc_array[connection1][i], sc_array[connection2][i]] for i in range(0, len(sc_array[0]))] 
    for i in range(len(sc_array[0])):
        if sc_array[connection1][i] == sc_array[connection2][i]:
            if sc_array[connection1][i] != '' and sc_array[connection1][i] != '0':
                print(i,sc_array[0][i],sc_array[connection1][i] )
    print()

def matches_v(connection1, connection2):
    # print(sc_array[0])
    # print(sc_array[connection1])
    # print(sc_array[connection2])
    merged_list = [[sc_array[0][i], sc_array[connection1][i], sc_array[connection2][i]] for i in range(0, len(sc_array[0]))] 
    for i in range(len(sc_array[0])):
        if sc_array[connection1][i] == sc_array[connection2][i]:
            if sc_array[connection1][i] != '' and sc_array[connection1][i] != '0':
                print(i,sc_array[0][i],sc_array[connection1][i] )
    print()
    print(merged_list)
    print()


# Define Cluster Model
cluster_model = KMeans(n_clusters=30, random_state=10)

# Fit the model to Social Circles Matrix
cluster_model.fit(sc_matrix)

# Display the cluster labels assigned to each record (30 clusters created = 0-29)
cluster_model.labels_

cluster_labels = cluster_model.fit_predict(sc_matrix)

# Add cluster labels to end of data frame
# cp = connection Predictions Data Frame with firend_cluster column appended to the right of array
cp=df
cp['connection_cluster'] = cluster_labels

# Find clusters of connections for '1_Alabama_Alexander_City'

# Store record for '1_Alabama_Alexander_City'

match_me = cp[cp.Surrogate_Key==sc_array[connect_my_record, 0]]
match_cluster_label = match_me['connection_cluster'].iat[0]

# match_me = sc_array[connect_my_record]
# print(match_me)

# Extract the label of cluster from record
# match_cluster_label = match_me.at[0,'connection_cluster']

# Print all records with the name cluster label
abc = cp[cp.connection_cluster==match_cluster_label]
cluster_list = list(abc.index)
for i in cluster_list:
    print('Cluster Record Matched to: ',i)
    matches(connect_my_record, i)

    
# Select a record with which to compare all other records for connections
connection1 = connect_my_record - 1

# Create an array to store prediction values
predict_connection = np.zeros(len(sc_array),dtype=
                           [('record', int), ('interests', int), ('likes', int), ('daughters', int), ('sons', int)]
                          )

# Set the first column to the record number so that we can sort by predictions later and retain record number
for i in range(len(predict_connection)):
    predict_connection[i][0] = i+1

#Calculate values over just interests [ShortHikes...Anime]
predict_array = sc_matrix[0:,:38]

print('Original: ', sc_array[connect_my_record][0])
for i in range(1,len(predict_array)):
    for j in range(len(predict_array[0])):
        predict_connection[i][1] += int(predict_array[i][j]) * int(predict_array[connection1][j])

# Sort in descending order, highest matching score first
predict_connection[::-1].sort(order='interests')

print()
predict_list=predict_connection[:5]['record'].tolist()
for i in predict_list:
    print('Predictive Record Matched to: ',i)
    matches(connect_my_record,i)


Cluster Record Matched to:  1
1_Alabama_Alexander_City
1 Kids Yes
14 ExerciseBudy 1
21 KidsNightOut 1
23 FamilyFunNight 1
24 IceCreamSocial 2
34 CommunityService 1
40 Introvert Yes
41 Extrovert No
45 Short_Walks 1
88 Helping_others_connect 1

Cluster Record Matched to:  7
7_Alabama_Bessemer
2 ShortHikes 2
27 PizzaNights 1
29 ChocolateTasting 2
34 CommunityService 1
42 Restaurants 1
43 Bars 1
45 Short_Walks 1
46 Smokers -1
47 Drinkers 1
53 Thrifty_Activities 1
54 Extravagant_Activities 1
55 Watching_Sports -1
56 Hanging_Out 1
57 Big_Crowds -1
87 Actively_making_friends 1

Cluster Record Matched to:  38
38_Alabama_Tuscumbia
5 BookClub 1
6 GamesNight 1
13 Running 1
14 ExerciseBudy 1
17 Crafting 1
26 WineAndCheese 2
34 CommunityService 1
40 Introvert Yes
41 Extrovert No
42 Restaurants 1
43 Bars 1
46 Smokers -1
49 Religous_Discussions -1
50 Political_Discussions -1
53 Thrifty_Activities 1
56 Hanging_Out 1
57 Big_Crowds -1
58 Loud_Crowds -1

Cluster Record Matched to:  61
61_Arizona_Douglas
