# Assignment 4

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle

---

## Part 1 - Random Graph Identification

For the first part of this assignment you will analyze randomly generated graphs and determine which algorithm created them.

In [3]:
G1 = nx.read_gpickle("assets/A4_P1_G1")
G2 = nx.read_gpickle("assets/A4_P1_G2")
G3 = nx.read_gpickle("assets/A4_P1_G3")
G4 = nx.read_gpickle("assets/A4_P1_G4")
G5 = nx.read_gpickle("assets/A4_P1_G5")
P1_Graphs = [G1, G2, G3, G4, G5]

<br>
`P1_Graphs` is a list containing 5 networkx graphs. Each of these graphs were generated by one of three possible algorithms:
* Preferential Attachment (`'PA'`)
* Small World with low probability of rewiring (`'SW_L'`)
* Small World with high probability of rewiring (`'SW_H'`)

Anaylze each of the 5 graphs using any methodology and determine which of the three algorithms generated each graph.

*The `graph_identification` function should return a list of length 5 where each element in the list is either `'PA'`, `'SW_L'`, or `'SW_H'`.*

In [4]:
for G in P1_Graphs:
    preferential_score = []
    preferential_score = [i[2] for i in nx.preferential_attachment(G)]

In [5]:
def graph_identification():
    # YOUR CODE HERE
    import networkx as nx
    methods = []
    preferential_score = []
    for G in P1_Graphs:
        avg_shortest_path_length = nx.average_shortest_path_length(G)
        clustering_coefficient = nx.average_clustering(G)
        preferential_score = [i[2] for i in nx.preferential_attachment(G)]
        if preferential_score > :
            methods.append('PA')
        elif avg_shortest_path_length < clustering_coefficient:
            methods.append('SW_L')
        else:
            methods.append('SW_H')  
    return methods

SyntaxError: invalid syntax (1110051046.py, line 10)

In [None]:
ans_one = graph_identification()
assert type(ans_one) == list, "You must return a list"


---

## Part 2 - Company Emails

For the second part of this assignment you will be working with a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people.

The network also contains the node attributes `Department` and `ManagmentSalary`.

`Department` indicates the department in the company which the person belongs to, and `ManagmentSalary` indicates whether that person is receiving a managment position salary.

In [6]:
G = pickle.load(open('assets/email_prediction_NEW.txt', 'rb'))

print(f"Graph with {len(nx.nodes(G))} nodes and {len(nx.edges(G))} edges")

Graph with 1005 nodes and 16706 edges


### Part 2A - Salary Prediction

Using network `G`, identify the people in the network with missing values for the node attribute `ManagementSalary` and predict whether or not these individuals are receiving a managment position salary.

To accomplish this, you will need to create a matrix of node features of your choice using networkx, train a sklearn classifier on nodes that have `ManagementSalary` data, and predict a probability of the node receiving a managment salary for nodes where `ManagementSalary` is missing.



Your predictions will need to be given as the probability that the corresponding employee is receiving a managment position salary.

The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).

Your grade will be based on the AUC score computed for your classifier. A model which with an AUC of 0.75 or higher will recieve full points.

Using your trained classifier, return a Pandas series of length 252 with the data being the probability of receiving managment salary, and the index being the node id.

    Example:
    
        1       1.0
        2       0.0
        5       0.8
        8       1.0
            ...
        996     0.7
        1000    0.5
        1001    0.0
        Length: 252, dtype: float64

In [7]:
list(G.nodes(data=True))[:5] # print the first 5 nodes

[(0, {'Department': 1, 'ManagementSalary': 0.0}),
 (1, {'Department': 1, 'ManagementSalary': nan}),
 (581, {'Department': 3, 'ManagementSalary': 0.0}),
 (6, {'Department': 25, 'ManagementSalary': 1.0}),
 (65, {'Department': 4, 'ManagementSalary': nan})]

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
def is_management(node):
            managementSalary = node[1]['ManagementSalary']
            if managementSalary == 0:
                return 0
            elif managementSalary == 1:
                return 1
            else:
                return None

df = pd.DataFrame(index = G.nodes())
df['clustering'] = pd.Series(nx.clustering(G))

my_degrees = G.degree()
df['degree'] = [v for k, v in my_degrees]

df['degree_centrality'] = pd.Series(nx.degree_centrality(G))
df['closeness'] = pd.Series(nx.closeness_centrality(G))
df['betweenness'] = pd.Series(nx.betweenness_centrality(G))
df['pr'] = pd.Series(nx.pagerank(G))
df['is_management'] = pd.Series([is_management(node) for node in G.nodes(data=True)])

df_train = df[~pd.isnull(df['is_management'])]
df_test = df[pd.isnull(df['is_management'])]
df_test = df_test.drop('is_management', axis=1)
features = ['clustering', 'degree', 'degree_centrality', 'closeness', 'betweenness', 'pr']
X_train = df_train[features]
Y_train = df_train['is_management']

X_test = df_test[features]
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, Y_train)
test_proba = clf.predict_proba(X_test_scaled)[:, 1]
final = pd.Series(test_proba,X_test.index)   
final = final.dropna() 

In [70]:
def salary_predictions():
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn import preprocessing
    from sklearn.neural_network import MLPClassifier
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.model_selection import train_test_split
    # YOUR CODE HERE


    def is_management(node):
        managementSalary = node[1]['ManagementSalary']
        if managementSalary == 0:
            return 0
        elif managementSalary == 1:
            return 1
        else:
            return -1


    df = pd.DataFrame(index=G.nodes())

    # Create new columns
    df['clustering'] = pd.Series(nx.clustering(G))

    # iterate through degreeview
    my_degrees = G.degree()
    df['degree'] = [v for k, v in my_degrees]

    df['degree_centrality'] = pd.Series(nx.degree_centrality(G))
    df['closeness'] = pd.Series(nx.closeness_centrality(G))
    df['betweenness'] = pd.Series(nx.betweenness_centrality(G))
    df['pr'] = pd.Series(nx.pagerank(G))
    df['is_management'] = pd.Series([is_management(node)
                                    for node in G.nodes(data=True)])

    df = df.dropna()

    # separate training and testing data
    feature_list = ['clustering', 'degree',
                    'degree_centrality', 'closeness', 'betweenness', 'pr']
    df_train = df[df['is_management'].isin([0, 1])]
    df_test = df[df['is_management'] == -1]

    X_train = df_train[feature_list]
    y_train = df_train['is_management']
    X_test = df_test[feature_list]
    y_test = df_test['is_management']

    clf = RandomForestClassifier(
        n_estimators=20, max_depth=10).fit(X_train, y_train)
    clf.fit(X_train, y_train)
    results = clf.predict_proba(X_test)[:, 1]
    series = pd.Series(results,X_test.index)
    return clf.score(X_test, y_test)


In [71]:
salary_predictions()

0.0

In [39]:
ans_salary_preds = salary_predictions()
assert type(ans_salary_preds) == pd.core.series.Series, "You must return a Pandas series"
assert len(ans_salary_preds) == 252, "The series must be of length 252"


### Part 2B - New Connections Prediction

For the last part of this assignment, you will predict future connections between employees of the network. The future connections information has been loaded into the variable `future_connections`. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the `Future Connection` column indicates if an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.

In [None]:
future_connections = pd.read_csv('assets/Future_Connections.csv', index_col=0, converters={0: eval})
future_connections.head(10)

Unnamed: 0,Future Connection
"(6, 840)",0.0
"(4, 197)",0.0
"(620, 979)",0.0
"(519, 872)",0.0
"(382, 423)",0.0
"(97, 226)",1.0
"(349, 905)",0.0
"(429, 860)",0.0
"(309, 989)",0.0
"(468, 880)",0.0


Using network `G` and `future_connections`, identify the edges in `future_connections` with missing values and predict whether or not these edges will have a future connection.

To accomplish this, you will need to:      
1. Create a matrix of features of your choice for the edges found in `future_connections` using Networkx     
2. Train a sklearn classifier on those edges in `future_connections` that have `Future Connection` data     
3. Predict a probability of the edge being a future connection for those edges in `future_connections` where `Future Connection` is missing.



Your predictions will need to be given as the probability of the corresponding edge being a future connection.

The evaluation metric for this assignment is the Area Under the ROC Curve (AUC).

Your grade will be based on the AUC score computed for your classifier. A model which with an AUC of 0.75 or higher will recieve full points.

Using your trained classifier, return a series of length 122112 with the data being the probability of the edge being a future connection, and the index being the edge as represented by a tuple of nodes.

    Example:
    
        (107, 348)    0.35
        (542, 751)    0.40
        (20, 426)     0.55
        (50, 989)     0.35
                  ...
        (939, 940)    0.15
        (555, 905)    0.35
        (75, 101)     0.65
        Length: 122112, dtype: float64

In [None]:
def new_connections_predictions():
    
    # create variables which describe the nodes of the network, how well they are connected and 
    pa_preds = nx.preferential_attachment(G,future_connections.index)
    jc_preds = nx.jaccard_coefficient(G,future_connections.index)


    pa_my_l = [] 
    pa_my_val = []

    jc_my_l = [] 
    jc_my_val = []

    my_dfs = [pa_preds, jc_preds]
    count = 0
    for my_df in my_dfs:
        if count == 0:
            for u,v,p in my_df:
                pa_my_l.append((u,v))
                pa_my_val.append(p)
        if count > 0:
            for u,v,p in my_df:
                jc_my_l.append((u,v))
                jc_my_val.append(p)
        count += 1

    # write a code to check if the two nodes are in the same or a different department, if so add either a one or a zero

    pa_preds = nx.preferential_attachment(G,future_connections.index)

    same_dep = []
    for u,v,p in pa_preds:
        if list(G.nodes[u].values())[0] == list(G.nodes[v].values())[0]:
            same_dep.append(1)
            va1 = list(G.nodes[u].values())[0]
            va2 = list(G.nodes[v].values())[0]
            #print(str(va1)+ "and" + str(va2))
        else:
            same_dep.append(0)


    # get the transformed data set 
    df_pa = pd.DataFrame({"pa_score":pa_my_val, "same_depart": same_dep}, index= pa_my_l)
    df_jc = pd.DataFrame({"jc_score":jc_my_val}, index= jc_my_l)            


    df_score = pd.merge(df_jc, df_pa, left_index=True, right_index=True)
    df_fin = pd.merge(future_connections, df_score, left_index=True, right_index=True)

    # machine learning

    # train test set
    test_X = df_fin[df_fin["Future Connection"].isnull()].iloc[:,[1,2,3]]
    train_X = df_fin[df_fin["Future Connection"].notnull()].iloc[:,[0,1,2,3]]
    train_X["Future Connection"] = train_X["Future Connection"].astype(int)


    # going with a logistic regression
    from sklearn.linear_model import LogisticRegression
    reg = LogisticRegression().fit(train_X.iloc[:,[1,2,3]], train_X.iloc[:,0])
    res = reg.predict(test_X.iloc[:,[0,1,2]])


    fin = reg.predict_proba(test_X.iloc[:,[0,1,2]])
    te = [(x[1]) for x in fin]
    res = pd.Series(te, index= test_X.index.values)

    return res

In [None]:
ans_prob_preds = new_connections_predictions()
assert type(ans_prob_preds) == pd.core.series.Series, "You must return a Pandas series"
assert len(ans_prob_preds) == 122112, "The series must be of length 122112"
