# COMMUNITY DETECTION AND GEOLOCATION 

## Load data 

In [None]:
import pandas as pd
# Read the tweet export (Excel workbook) into the working DataFrame and preview it.
df = pd.read_excel(
    io="datacraft_data_vaccination_5g_2021_05_11.xlsx",
    engine="openpyxl",
)
df.head()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

## Geolocation 

### Get coordinates of places indicated in the data 

In [None]:
!pip install geopy --user

In [None]:
from geopy.geocoders import Nominatim

In [None]:
# Sanity check: geocode one known address before geocoding the places of the dataset.
geocoder = Nominatim(user_agent="datacrafting_exploTweets")
adresse = "191 rue Saint-Jacques, Paris, France"
location = geocoder.geocode(adresse)
# geocode() returns None when the service finds no match; guard against it instead of
# failing with AttributeError on `.latitude`.
if location is None:
    print(f"No geocoding result for {adresse!r}")
else:
    print((location.latitude, location.longitude))

In [1]:
# Preview the free-text place names that will be sent to the geocoder.
# Requires the "Load data" cell to have been run first (df must exist).
df.place_name.head()

NameError: name 'df' is not defined

In [None]:
from tqdm import tqdm
from geopy.extra.rate_limiter import RateLimiter

geocoder = Nominatim(user_agent="datacrafting_exploTweets")
# The public Nominatim service allows at most one request per second; RateLimiter
# spaces the calls out and retries transient service errors.
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1)

# Only geocode places that appear more than once in the data.
# value_counts() is computed once; Series.iteritems() no longer exists in pandas 2.x.
place_counts = df.place_name.value_counts()
frequent_places = place_counts[place_counts > 1].index

# coding_place maps a place name to the geopy Location found for it.
coding_place = dict()
for place in tqdm(frequent_places):
    loc = geocode(place)
    # Places the geocoder cannot resolve come back as None; leave them out so that
    # every stored value exposes .longitude / .latitude.
    if loc is not None:
        coding_place[place] = loc

In [None]:
# coding_place is a plain dict (it has no .head()); preview its first few entries.
list(coding_place.items())[:5]

In [None]:
def transfoPlaceGps(strPlace):
    """Return the (longitude, latitude) pair recorded for a place name.

    The coordinates are read from the module-level ``coding_place`` dict, whose
    values are expected to expose ``longitude`` and ``latitude`` attributes.

    Returns None when ``strPlace`` is not a key of ``coding_place`` or when the
    value stored for it is None (place that could not be geocoded).
    """
    loc = coding_place.get(strPlace)
    if loc is None:
        return None
    # Longitude first: the plots use it as x and the latitude as y.
    return (loc.longitude, loc.latitude)

# df["position_gps"] = df.place_name.apply(lambda x: if not pd.isna(x) else None)

In [None]:
# Attach to every tweet the (longitude, latitude) of its place, or None when unknown.
df["position_gps"] = df["place_name"].apply(transfoPlaceGps)

### Plot coordinates 

In [None]:
from matplotlib import pyplot as plt

from bokeh.models import GeoJSONDataSource
from bokeh.plotting import figure, show, output_notebook
import geopandas as gp

import bokeh.io

# Render Bokeh output inline in the notebook.
bokeh.io.reset_output()
bokeh.io.output_notebook()

## tiles, worldmap
# NOTE(review): gp.datasets is no longer shipped with geopandas >= 1.0 — confirm the
# pinned geopandas version before re-running.
world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
geo_source = GeoJSONDataSource(geojson=world.to_json())

## setting up the figure
# width/height are used instead of plot_width/plot_height, which Bokeh 3 removed.
p = figure(title='World', tooltips=[('Country', '@name')],
           x_range=(-180, 180), y_range=(-90, 90),
           x_axis_location=None, y_axis_location=None,
           width=900, height=500
          )
p.patches('xs', 'ys', fill_alpha=0.4, fill_color='grey',
          line_color='black', line_width=0.5, source=geo_source
         )

## the graph: edgelist and node_positions
# edge_list = [(1,2), (1,5), (2,3), (2,4), (3,4)]
# pos = {1:(2, 47),
#       2: (-75, 42),
#       3:(-117, 34),
#       4:(-122, 49),
#       5:(25, -28)}

# Group the coordinates into a list of x (longitude) and a list of y (latitude).
# Places that could not be geocoded may be stored as None and are skipped.
located_places = [position for position in coding_place.values() if position is not None]
x = [position.longitude for position in located_places]
y = [position.latitude for position in located_places]

#draw_edges:
# for e in edge_list:
#     p.line([pos[e[0]][0], pos[e[1]][0]], [pos[e[0]][1], pos[e[1]][1]], line_width=2)

# The scattered Node markers
p.circle(x, y, size=8, color='navy', alpha=1)

show(p)

## Community detection

In [None]:
import networkx as nx
import community

### Create network

#### With a function

In [None]:
def compute_cluster(df):
    """
    Assign each row of ``df`` to a community of the retweet graph, in place.

    A directed graph is built with one edge ``pseudo -> retweeted_screen_name`` per
    row that has a ``retweeted_screen_name`` (edge attribute ``date`` = ``tweet_date``).
    Communities are computed with ``community.best_partition`` on the largest
    connected component of the undirected version of that graph.

    df: DataFrame with at least columns ['pseudo', 'retweeted_screen_name', 'tweet_date']

    Side effect: adds (or overwrites) the column ``df['cluster']`` with the community
    id of each row's ``pseudo``; -1 if the node of the pseudo is not in the giant
    connected component, or if ``df`` contains no retweet at all.

    Returns None.
    """
    print('Build graph...')
    retweets = df.loc[df["retweeted_screen_name"].notna(),
                      ["pseudo", "retweeted_screen_name", "tweet_date"]]
    if retweets.empty:
        # No edge at all: there is no connected component to pick, and max() over an
        # empty sequence would raise ValueError further down.
        df['cluster'] = -1
        return

    G = nx.DiGraph()
    # Iterate over the three columns directly; this avoids building one Series per row.
    for pseudo, retweeted, tweet_date in zip(retweets["pseudo"],
                                             retweets["retweeted_screen_name"],
                                             retweets["tweet_date"]):
        G.add_edge(pseudo, retweeted, date=tweet_date)

    print('Find clusters...')
    # Largest connected component of the undirected graph ("giant component").
    Gc = nx.subgraph(G, max(nx.connected_components(nx.Graph(G)), key=len))
    # comms maps each node of the giant component to its community id.
    comms = community.best_partition(nx.Graph(Gc))

    df['cluster'] = [comms.get(pseudo, -1) for pseudo in df['pseudo']]

In [None]:
# compute_cluster works in place: it must be called before df['cluster'] exists.
compute_cluster(df)
df['cluster']

#### Without a function

In [None]:
# Directed retweet graph: one edge from the retweeting account to the retweeted one,
# tagged with the date of the tweet.
G = nx.DiGraph()
retweet_rows = df.loc[df.retweeted_screen_name.notna(),
                      ["pseudo", "retweeted_screen_name", "tweet_date"]]
for row_index, retweet in retweet_rows.iterrows():
    G.add_edge(retweet["pseudo"], retweet["retweeted_screen_name"], date=retweet["tweet_date"])

In [None]:
# Size of the full retweet graph, as (number of nodes, number of edges).
G.number_of_nodes(), G.number_of_edges()

In [None]:
# Connected components are computed on the undirected version of G; the largest one
# is then cut out of the directed graph.
largest_component = max(nx.connected_components(nx.Graph(G)), key=len)
Gc = nx.subgraph(G, largest_component)

In [None]:
# Size of the largest connected component, as (number of nodes, number of edges).
Gc.number_of_nodes(), Gc.number_of_edges()

Identify communities

In [None]:
# Partition the undirected version of the largest component into communities.
# comms maps each node (account name) to the id of its community.
# NOTE(review): presumably python-louvain's best_partition; if so, the partition can
# differ between runs unless a random_state is passed — confirm.
comms = community.best_partition(nx.Graph(Gc))

In [None]:
# Invert the node -> community mapping into community id -> list of member accounts.
result_communautes = dict()
for member, community_id in comms.items():
    result_communautes.setdefault(community_id, []).append(member)

In [None]:
# Print the size of (at most) the first 40 communities. Iterating over the ids that
# actually exist avoids a KeyError when the partition holds fewer than 40 communities.
for community_id in sorted(result_communautes)[:40]:
    print(community_id, len(result_communautes[community_id]))

In [None]:
## Store the results:
import json
# The context manager closes the file even if json.dump raises.
with open("communautes_composante_principale.json", "w") as f:
    json.dump(comms, f)

In [None]:
# Load the results:
import json
# The context manager closes the file even if json.load raises.
with open("communautes_composante_principale.json", "r") as f:
    comms = json.load(f)

In [None]:
# Get the community id of one user.
# comms only holds accounts of the largest connected component: any other name
# raises KeyError here.
comms["martineroy1959"]

In [None]:
# Original (non-retweet) tweets with a place name, written by members of community 4.
df.loc[
    df.pseudo.isin(result_communautes[4])
    & df.retweeted_screen_name.isna()
    & df.place_name.notna(),
    ["pseudo", "tweet", "tweet_date", "place_name"],
]

### Geolocation of communities 

In [None]:
from matplotlib import pyplot as plt

from bokeh.models import GeoJSONDataSource
from bokeh.plotting import figure, show, output_notebook
import geopandas as gp

import bokeh.io


def geoPlotGroup(groupID):
    """Show on a world map the geocoded tweet locations of one community.

    groupID: key of the module-level ``result_communautes`` dict (community id).
             An unknown id raises KeyError.

    Reads the module-level ``df`` (columns ``pseudo``, ``place_name`` and
    ``position_gps``, the latter holding (longitude, latitude) pairs or None) and
    displays a Bokeh figure in the notebook. Returns None.
    """
    # Render Bokeh output inline in the notebook.
    bokeh.io.reset_output()
    bokeh.io.output_notebook()

    ## tiles, worldmap
    # NOTE(review): gp.datasets is no longer shipped with geopandas >= 1.0 — confirm
    # the pinned geopandas version before re-running.
    world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))
    geo_source = GeoJSONDataSource(geojson=world.to_json())

    ## setting up the figure
    # width/height are used instead of plot_width/plot_height, which Bokeh 3 removed.
    p = figure(title='World', tooltips=[('Country', '@name')],
               x_range=(-180, 180), y_range=(-90, 90),
               x_axis_location=None, y_axis_location=None,
               width=900, height=500
              )
    p.patches('xs', 'ys', fill_alpha=0.4, fill_color='grey',
              line_color='black', line_width=0.5, source=geo_source
             )

    # Positions of the tweets written by members of the community; rows without a
    # place name or without a geocoded position are dropped.
    in_group = df.pseudo.isin(result_communautes[groupID])
    positions = df.loc[in_group & df.place_name.notna(), "position_gps"].dropna()

    # Split the (longitude, latitude) pairs into x and y in a single pass each.
    x = [position[0] for position in positions]
    y = [position[1] for position in positions]

    # The scattered Node markers
    p.circle(x, y, size=8, color='navy', alpha=1)

    show(p)

In [None]:
# Plot the geocoded tweet locations of community 12 (the argument is a community id,
# i.e. a key of result_communautes, not a user):
geoPlotGroup(12)

In [None]:
# Check the users of this cluster: original (non-retweet) tweets of community 12.
df.loc[
    df.pseudo.isin(result_communautes[12]) & df.retweeted_screen_name.isna(),
    ["pseudo", "tweet", "tweet_date", "place_name"],
]

In [None]:
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image

In [None]:
# Check out the most frequent words used by this cluster.
groupID = 12
stopW = ["un", "une", "le", "la", "de", "des", "et", "rt", "https", "co", "est", "pas", "il", "elle"]
# Missing texts are dropped because str.join fails on NaN. The sample size is capped at
# the number of available tweets: Series.sample(500) raises ValueError when the
# community has fewer than 500 tweets.
group_tweets = df[df.pseudo.isin(result_communautes[groupID])].tweet_formatted.dropna()
text = ".".join(group_tweets.sample(min(500, len(group_tweets))))
wordcloud = WordCloud(background_color = 'white', stopwords = stopW, max_words = 50).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()