In [478]:
import pandas as pd
import networkx as nx
import json
import matplotlib.pyplot as plt
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [607]:
tweets = []
with open("Gamergate.json", "r", encoding="utf-8") as f:
    for line in f.readlines():
        tweet = json.loads(line)

        tweets.append(tweet)

In [608]:
weight_map = {}
G = nx.DiGraph()
for t in tweets:
    if 'quoted_status_id_str' not in t:
        usr = t['user']['screen_name']
        G.add_node(usr)

        if 'retweeted_status' in t:

            rt = t['retweeted_status']['user']['screen_name']

            if (usr, rt) in weight_map:
                weight_map[(usr, rt)] += 1

            else:
                weight_map[(usr, rt)] = 1

            G.add_edge(usr, rt, weight = weight_map[(usr, rt)])


In [609]:
nx.write_gexf(G, "g.gexf")

In [4]:
print("Number of edges", G.number_of_edges())
print("Number of nodes", G.number_of_nodes())

Number of edges 48545
Number of nodes 21414


In [5]:
max_edge = -1
max_node = ''
end = ''

outgoing = {}
incoming = {}

for u, v, weight in G.edges(data="weight"):
    if weight is not None:
        
        if u in outgoing:
            outgoing[u] += weight
        else:
            outgoing[u] = weight
            
        if v in incoming:
            incoming[v] += weight
        else:
            incoming[v] = weight
        
        if weight > max_edge:
            max_edge = weight
            max_node = u
            end = v
            
print("Max retweeted", max(incoming, key=incoming.get))
print("Max retweeter", max(outgoing, key=outgoing.get))
print(G.in_degree('ChrisWarcraft', weight='weight'))
print(G.out_degree('SomeKindaBoogin', weight='weight'))

Max retweeted ChrisWarcraft
Max retweeter SomeKindaBoogin
4189
1054


In [589]:
mini = []
with open("toy_test/mini_mid_gamergate.json", "r", encoding="utf-8") as f:
    for line in f.readlines():
        tweet = json.loads(line)

        mini.append(tweet)

In [590]:
def edge_to_remove(G):
    
    all_edges = nx.edge_betweenness_centrality(G, weight='weight')
 
    for key, value in sorted(all_edges.items(), key=lambda item: item[1], reverse = True):
 
        maxx = value
        break
        
    edges=[]
    for key, value in sorted(all_edges.items(), key=lambda item: item[1], reverse = True):
        if value == maxx:
            edges.append(key)
            
        else:
            break
        
            
    return edges



In [599]:
def girvan_newman(G, OG):
    

    splits = nx.number_connected_components(G)
    edges = G.number_of_edges()
    max_mod = -100
    
    while(edges > 1):
        
        
        print('edges left', edges)
        to_remove = edge_to_remove(G)
        
        for t in to_remove:
            G.remove_edge(t[0], t[1])
        
        edges = G.number_of_edges()
        
            
       
        old_splits = splits
        splits = nx.number_connected_components(G)
        
        
        if old_splits != splits:
            print('components', splits)
            
            modularity = get_modularity(G, OG)
           
        
            if modularity > max_mod:
                
                
                max_mod = modularity
                print('new max mod:', max_mod)
                nx.write_gexf(G, "graph.gexf")
                id_splits = splits
                partitions = list(nx.connected_components(G))
        
    return partitions, max_mod, id_splits

In [595]:
def get_modularity(G, OG):
    M = OG.size('weight')
    cumulative_modularity = 0

    partitions = nx.connected_components(G)
    
    for parts in partitions:
        part_modularity = 0

        for i in parts:
            for j in parts:
                
                if i != j:
                    
                    ki = OG.degree(i, weight='weight')
                    kj = OG.degree(j, weight='weight')
                    
                    A_bool = G.has_edge(i,j)

                    if A_bool:
                        Aij = G[i][j]['weight']
                    else:
                        Aij = 0
                    
                    rh = (ki*kj)/(2*M)
                    tmp_modularity = Aij - rh
                    
                    
                    part_modularity += tmp_modularity
                    
        
        cumulative_modularity += part_modularity
          
    return cumulative_modularity/(2*M)    
            

In [605]:
weight_map = {}
tweet_map = {}
G = nx.Graph()
OG = nx.Graph()
for t in mini:
    val = True
    if val:
        usr = t['user']['screen_name']
        text = t['text']
        if usr in tweet_map:
            tweet_map[usr] += ' ' + text
        else:
            tweet_map[usr] = text
        G.add_node(usr)
        OG.add_node(usr)
        if 'retweeted_status' in t:

            rt = t['retweeted_status']['user']['screen_name']
            
            rtext = t['retweeted_status']['text']
            if rt in tweet_map:
                tweet_map[rt] += ' ' + rtext
                tweet_map[usr] += ' ' + rtext
            else:
                tweet_map[rt] = rtext
                tweet_map[usr] = rtext

            if (usr, rt) in weight_map:
                weight_map[(usr, rt)] += 1

            else:
                weight_map[(usr, rt)] = 1

            G.add_edge(usr, rt, weight = weight_map[(usr, rt)])
            OG.add_edge(usr, rt, weight = weight_map[(usr, rt)])

In [597]:
grp, mod, comps = girvan_newman(G, OG)

edges left 976
components 431
new max mod: 0.46334002010368114
edges left 975
edges left 974
edges left 973
edges left 972
edges left 971
edges left 970
components 432
new max mod: 0.46951503913091314
edges left 969
edges left 968
edges left 967
edges left 966
edges left 965
edges left 964
edges left 963
edges left 962
edges left 961
edges left 960
edges left 959
edges left 958
edges left 957
edges left 956
edges left 955
edges left 954
edges left 953
edges left 952
edges left 951
edges left 950
edges left 949
edges left 948
edges left 947
edges left 946
edges left 945
edges left 944
edges left 943
edges left 942
edges left 941
edges left 940
edges left 939
edges left 938
edges left 937
edges left 936
edges left 935
edges left 934
edges left 933
edges left 932
edges left 931
edges left 930
edges left 929
edges left 928
edges left 927
edges left 926
edges left 925
edges left 924
edges left 923
edges left 922
edges left 921
edges left 920
edges left 919
edges left 918
components 433
new 

ValueError: too many values to unpack (expected 3)

In [602]:
rg = nx.read_gexf('graph.gexf')

In [604]:
nx.number_connected_components(rg)

451

In [458]:
grp.sort(key=len)
results = []
for g in grp:
    results.append(sorted(list(g)))

rst = sorted(results, key=itemgetter(0))
rst.sort(key=len)

In [465]:
with open('listfile.txt', 'w') as filehandle:
    filehandle.write("Best Modularity is: " + str(mod) + '\n')
    filehandle.writelines("%s\n" % str(place)[1:-1].replace(" ", "") for place in rst)

In [610]:

tweet_map = {}

for t in mini:
    val = True
    if val:
        usr = t['user']['screen_name']
        text = t['text']
        if usr in tweet_map:
            tweet_map[usr] += ' ' + text
        else:
            tweet_map[usr] = text
    
        if 'retweeted_status' in t:

            rt = t['retweeted_status']['user']['screen_name']
            
            rtext = t['retweeted_status']['text']
            if rt in tweet_map:
                tweet_map[rt] += ' ' + rtext

            else:
                tweet_map[rt] = rtext
               



In [611]:
k_comm = rst[-2:]
rest = rst[:-2]
arr = []

cluster = 1
for k in k_comm:
    for usr in k:
        tweet = tweet_map[usr]
            
        bit = (usr, tweet, cluster)
        arr.append(bit)
    cluster -=1

df = pd.DataFrame(arr, columns=['user', 'tweet', 'community'])
train_len = len(df)

arr = []

for k in rest:
    for usr in k:
        tweet = tweet_map[usr]

        bit = (usr, tweet, -1)
        arr.append(bit)
df2 = pd.DataFrame(arr, columns=['user', 'tweet', 'community'])
df = df.append(df2)

In [612]:
train = df[:train_len]
test = df[train_len:]

vectorizer = TfidfVectorizer()

features_train = train['tweet']
labels_train = train['community']
features_test = test['tweet']
labels_test = test['community']

features_train = vectorizer.fit_transform(features_train)

In [613]:
clf = MultinomialNB()
clf.fit(features_train, labels_train)
score_train = clf.score(features_train, labels_train)

In [614]:
features_test = vectorizer.transform(features_test)
preds = clf.predict(features_test)
test['community'] = preds
train['community'] = clf.predict(features_train)
final = test.append(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [619]:
results_map = {}
results_map[0] = []
results_map[1] = []

for index, row in final.iterrows():
    results_map[row['community']].append(row['user'])

with open("task2_B.txt", "w", encoding="utf-8") as f:
    for com in results_map:
        result = results_map[com]
        push = sorted(result)
        f.write("%s\n" % str(push)[1:-1].replace(" ", ""))
    

In [620]:
real = {}
real[0] = []
real[1] = []
with open("toy_test/task2_B.txt", "r", encoding="utf-8") as f:
    ct = 0
    for line in f.readlines():
        word = ''
        for l in line:
            if l == ',':
                real[ct].append(word[1:-1])
                word = ''
            else:
                word+=l
               
        ct+=1
pred = {}
pred[0] = []
pred[1] = []
with open("task2_B.txt", "r", encoding="utf-8") as f:
    ct = 0
    for line in f.readlines():
        word = ''
        for l in line:
            if l == ',':
                pred[ct].append(word[1:-1])
                word = ''
            else:
                word+=l
               
        ct+=1

In [621]:
for p in pred:
    print(len(pred[p]))

216
965


In [622]:
for r in real:
    print(len(real[r]))

216
965


In [623]:
count_vect = CountVectorizer()

In [624]:
features_train = train['tweet']
labels_train = train['community']
X_train_counts = count_vect.fit_transform(features_train)

In [625]:
clf = MultinomialNB().fit(X_train_counts, labels_train)
clf.score(X_train_counts, labels_train)

1.0

In [626]:
X_new_counts = count_vect.transform(test['tweet'])

#do the predictions
preds = clf.predict(X_new_counts)

In [627]:
test['community'] = preds
train['community'] = clf.predict(X_train_counts)
final = test.append(train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [628]:
results_map = {}
results_map[0] = []
results_map[1] = []

for index, row in final.iterrows():
    results_map[row['community']].append(row['user'])

with open("task2_C.txt", "w", encoding="utf-8") as f:
    for com in results_map:
        result = results_map[com]
        push = sorted(result)
        f.write("%s\n" % str(push)[1:-1].replace(" ", ""))

In [629]:
real = {}
real[0] = []
real[1] = []
with open("toy_test/task2_C.txt", "r", encoding="utf-8") as f:
    ct = 0
    for line in f.readlines():
        word = ''
        for l in line:
            if l == ',':
                real[ct].append(word[1:-1])
                word = ''
            else:
                word+=l
               
        ct+=1
pred = {}
pred[0] = []
pred[1] = []
with open("task2_C.txt", "r", encoding="utf-8") as f:
    ct = 0
    for line in f.readlines():
        word = ''
        for l in line:
            if l == ',':
                pred[ct].append(word[1:-1])
                word = ''
            else:
                word+=l
               
        ct+=1

In [630]:
for p in pred:
    print(len(pred[p]))

152
1029


In [631]:
for r in real:
    print(len(real[r]))

152
1029
