In [1]:
import pandas as pd
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import geopandas as gpd
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16,9]
import numpy as np
import json
from shapely.geometry import Point, Polygon
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

In [2]:
# Read the 50 numbered tweet dump files (0_de_bunch.json .. 49_de_bunch.json)
# and merge their contents into a single flat list.
bunches = []

for bunch_index in range(50):

    bunch_path = './Twitter Bunches/' + str(bunch_index) + '_de_bunch.json'

    with open(bunch_path, 'r', encoding='utf-8') as bunch_file:
        # Add the file's items one by one (not the container itself) so that
        # `bunches` stays flat.
        bunches += json.load(bunch_file)
        
# Every tweet id in load order, duplicates included.
tweet_ids = [tweet['id'] for tweet in bunches]

# Sanity check: number of distinct ids vs. number of loaded tweets.
len(np.unique(np.array(tweet_ids))), len(bunches)

# De-duplicate by id. Dict keys are unique, so when an id occurs more than
# once the tweet seen last replaces the earlier one.
unique = {}

for tweet in bunches:

    unique[tweet['id']] = tweet

# Keep only the tweets that carry place bounding-box coordinates.
# Each kept tweet is stored as a one-item {tweet_id: tweet} dict, which is the
# shape the extraction loop that builds `results_list` expects.
matches = []

for key, value in unique.items():

    try:
        coordinates = value['place']['bounding_box']['coordinates']
    except (KeyError, TypeError):
        # KeyError: one of the fields is missing.
        # TypeError: 'place' or 'bounding_box' is None and cannot be indexed.
        # Anything else is a genuine error and is allowed to surface.
        continue

    if coordinates is not None:
        matches.append({key: value})

In [3]:
# Flatten every geotagged tweet into one record (dict) per tweet. The key
# order below becomes the column order of the DataFrame built from this list.
results_list = []

for match in matches:

    # Each entry of `matches` is a {tweet_id: tweet} dict; take its first item.
    tweet_id, tweet = next(iter(match.items()))

    user = tweet['user']
    place = tweet['place']

    # First ring of the place bounding box, and the centroid of the polygon it
    # describes. The centroid is used as the tweet's point location.
    bbox_ring = place['bounding_box']['coordinates'][0]
    centroid = Polygon(bbox_ring).centroid

    results_list.append({
        'Tweet ID': tweet_id,
        'Created at': pd.to_datetime(tweet['created_at']),
        'Full Text': tweet['full_text'],
        'User Name': user['name'],
        'User Alias': user['screen_name'],
        'Place ID': place['id'],
        'Place Name': place['name'],
        'Country Code': place['country_code'],
        'Country Name': place['country'],
        'Bounding Box': bbox_ring,
        # Point takes (x, y), i.e. the centroid's x first and y second.
        'Tweet Coordinates': Point(centroid.x, centroid.y),
        'Retweet Count': tweet['retweet_count'],
        'Favorite Count': tweet['favorite_count'],
        'User Location': user['location'],
        'User Followers': user['followers_count'],
        'User Friends': user['friends_count'],
        'Registration Date': user['created_at'],
        'Tweet URL': 'https://twitter.com/{}/status/{}'.format(user['screen_name'], tweet['id']),
    })

In [5]:
# Tabulate the records, then promote the table to a GeoDataFrame whose
# geometry is the per-tweet point, tagged with EPSG:4326.
results_df = pd.DataFrame(results_list)
tweet_points = results_df['Tweet Coordinates']
gdf = gpd.GeoDataFrame(results_df, geometry=tweet_points).set_crs(epsg=4326)

In [10]:
# Two-column table (lon, lat) of the tweet points: the only features that
# should be clustered.
coordinates_data = {'lon': gdf['geometry'].x, 'lat': gdf['geometry'].y}
coordinates_df = pd.DataFrame(data=coordinates_data)

# Elbow sweep: sum of squared distances (inertia) for k = 2 .. 19.
ssd = []

for i in range(2, 20):
    # Seeded so the sweep gives the same numbers on every Restart & Run All
    # (same seed the KMeans fits in this notebook use).
    km = MiniBatchKMeans(n_clusters=i, random_state=10)

    # Fit on lon/lat only. The labels are deliberately NOT written back into
    # `coordinates_df`: doing so would feed the previous run's cluster labels
    # into the next fit as a third feature and distort the inertia.
    km.fit(coordinates_df)

    # Calculate sum of squared distances
    ssd.append(km.inertia_)

# Candidate cluster counts for the silhouette sweep.
range_n_clusters = range(2, 20)

# Array copy of the coordinate table, used as the clustering input.
X = np.array(coordinates_df)

for n_clusters in range_n_clusters:

    # Fit a seeded KMeans for this k and score the resulting partition.
    candidate_labels = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)
    silhouette_avg = silhouette_score(X, candidate_labels)

    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    
# Final model. NOTE(review): 11 is presumably chosen because it has the
# highest average silhouette score among the values printed by the sweep;
# confirm if the data changes.
n_final_clusters = 11

clusterer = KMeans(n_clusters=n_final_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)
centers = clusterer.cluster_centers_

# Attach each tweet's cluster id to the tweet table.
gdf['Cluster Label'] = cluster_labels

# One point geometry per cluster centre; row k holds the centre of cluster k.
center_points = list(map(Point, centers))
centers_gdf = gpd.GeoDataFrame(centers, geometry=center_points)

# One hex colour per cluster, indexable by cluster id.
import seaborn as sns
palette = sns.color_palette(None, n_final_clusters).as_hex()

For n_clusters = 2 The average silhouette_score is : 0.5026467502002958
For n_clusters = 3 The average silhouette_score is : 0.5132745593074737
For n_clusters = 4 The average silhouette_score is : 0.566164215980916
For n_clusters = 5 The average silhouette_score is : 0.591191320195869
For n_clusters = 6 The average silhouette_score is : 0.5907779746624445
For n_clusters = 7 The average silhouette_score is : 0.5801221292146603
For n_clusters = 8 The average silhouette_score is : 0.574197565552152
For n_clusters = 9 The average silhouette_score is : 0.5913547539887604
For n_clusters = 10 The average silhouette_score is : 0.5456751570781648
For n_clusters = 11 The average silhouette_score is : 0.5950965589947061
For n_clusters = 12 The average silhouette_score is : 0.5628158455403909
For n_clusters = 13 The average silhouette_score is : 0.5572209434925244
For n_clusters = 14 The average silhouette_score is : 0.5677075306350605
For n_clusters = 15 The average silhouette_score is : 0.572056

In [11]:
# Read the Mapbox access token from a local file instead of hard-coding it.
# The context manager closes the file handle, and strip() removes the trailing
# newline most editors add, which would otherwise become part of the token.
with open("mapbox_token.txt") as token_file:
    mapbox_access_token = token_file.read().strip()

# One colour per cluster id, taken from the palette built with the clustering.
colors = palette

# One scatter trace per cluster, so each cluster gets its own colour.
traces = []

for cluster_num in set(cluster_labels):

    cluster_points = gdf[gdf['Cluster Label'] == cluster_num]

    # Hover text: the place name plus the cluster the point was assigned to.
    hover_text = cluster_points['Place Name'] + "<br>" + "Cluster ID: " + str(cluster_num)

    point_marker = go.scattermapbox.Marker(
        size=5,
        color=colors[cluster_num],
    )

    traces.append(
        go.Scattermapbox(
            # x of the point geometry is longitude, y is latitude.
            lon=cluster_points['geometry'].x.values,
            lat=cluster_points['geometry'].y.values,
            mode='markers',
            marker=point_marker,
            text=hover_text,
        )
    )
    
# Extra trace that marks the cluster centres in red on top of the tweets.
cluster_center_trace = go.Scattermapbox(
    # x of the point geometry is longitude, y is latitude.
    lon = centers_gdf['geometry'].x.values,
    lat = centers_gdf['geometry'].y.values,
    mode = 'markers',
    marker = go.scattermapbox.Marker(
        size = 7,
        color = 'red',
    ),
    # One hover label per centre. Row k of `centers_gdf` is the centre of
    # cluster k, so the row position is the cluster id. Deriving the count
    # from the frame keeps the labels in step with the number of clusters
    # (a fixed range(7) left some centres unlabelled).
    text = list(range(len(centers_gdf)))
    )

traces.append(cluster_center_trace)
    
# Map view: opens centred on lat 51 / lon 10 at zoom level 5, no rotation or tilt.
map_center = go.layout.mapbox.Center(lat=51, lon=10)

mapbox_settings = go.layout.Mapbox(
    accesstoken=mapbox_access_token,
    bearing=0,
    center=map_center,
    pitch=0,
    zoom=5,
)

# 800 x 800 figure; hovering picks the closest point.
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    width=800,
    height=800,
    mapbox=mapbox_settings,
)

# Assemble the figure from all traces, then merge the layout settings into it.
fig = go.Figure(data=traces)
fig.layout.update(layout)

iplot(fig, filename="geomap_twitter_cluster")

In [12]:
# Download the border polygons as GeoJSON (needs network access).
germany_borders = gpd.read_file("https://opendata.arcgis.com/datasets/b8d0cc7735774bed8e6df1c5410394a4_0.geojson")
# Take the row with index label 0 and turn it back into a one-row frame:
# .loc[0] yields a Series, wrapping it gives a single column, and .T
# transposes that column into a single row.
sub_gdf = gpd.GeoDataFrame(germany_borders.loc[0]).T
# Re-assign the geometry column from the source row.
# NOTE(review): presumably needed because the transpose does not keep an
# active geometry column -- confirm.
sub_gdf['geometry'] = germany_borders.loc[0]['geometry']
# Tag the frame with EPSG:4326, the same CRS that was set on `gdf`.
sub_gdf = sub_gdf.set_crs(epsg = 4326)
# Both frames must share a CRS before they are combined spatially.
gdf.crs == sub_gdf.crs

True

In [15]:
# Cannot have multiple geometry-like columns!
# Drop the raw bounding box and the duplicate point column, leaving
# 'geometry' as the only spatial column. errors='ignore' makes the cell safe
# to re-run: on a second execution the columns are already gone and a plain
# `del` would raise KeyError.
gdf = gdf.drop(columns=['Bounding Box', 'Tweet Coordinates'], errors='ignore')

In [21]:
# Keep only the tweet points that intersect the selected border polygon.
#
# Background on an overlay intersection returning zero rows:
# https://stackoverflow.com/questions/64200595/geopandas-overlay-intersection-returns-zero-rows
#
# Other ways to test whether a point falls inside a multipolygon:
# https://gis.stackexchange.com/questions/208546/check-if-a-point-falls-within-a-multipolygon-with-python

from geopandas.tools import overlay

pointInPolys = overlay(df1=gdf, df2=sub_gdf, how="intersection")

In [24]:
# Display the intersection result: tweet columns followed by the columns of
# the border polygon, one row per tweet point kept by the overlay.
pointInPolys

Unnamed: 0,Tweet ID,Created at,Full Text,User Name,User Alias,Place ID,Place Name,Country Code,Country Name,Retweet Count,...,SN_V1,SN_V2,SN_G,FK_S3,NUTS,EWZ,KFL,SHAPE_Length,SHAPE_Area,geometry
0,1381164372362428417,2021-04-11 08:37:01+00:00,Ich war nicht immer mit der Politik von Ex-Kan...,Udo Maaß 🔴🔴🔴,UdoMaass,ad2257a47d7f5ce8,Neuenkirchen,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (9.00064 54.23302)
1,1381163805258883072,2021-04-11 08:34:46+00:00,"Stelle mir gerade vor, dass die #Ministerpraes...",Udo Maaß 🔴🔴🔴,UdoMaass,ad2257a47d7f5ce8,Neuenkirchen,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (9.00064 54.23302)
2,1381162150618611713,2021-04-11 08:28:12+00:00,@trolkragnpulova @anwaltsgelaber Er behauptet ...,mArk,ExilSchorfheide,0057033fb8a3c9b3,Wattenbek,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (10.04702 54.15533)
3,1381161886394236930,2021-04-11 08:27:09+00:00,@CyberT3rrorizt Stimmt,Simone Peuss,PeussSimone,f42e88f63a54ebb3,Osterrönfeld,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (9.70901 54.27548)
4,1381160483110191104,2021-04-11 08:21:34+00:00,@Propylaeen Ja ja ja ja. (Obwohl: hier gerade ...,Wolfgang Lünenbürger (he/him/his) ¯\_(ツ)_/¯,luebue,26eb2479c4caf280,Bosau,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (10.50839 54.09566)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,1380826215267897344,2021-04-10 10:13:18+00:00,"Mino langweilt sich etwas im DUCKDALBEN, weil ...",Deutsche Seemannsmission @Nordkirche,DSM_Nordkirche,d2de95ce61028811,Pinneberg,DE,Germany,2,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (9.79640 53.64639)
316,1380825962716217346,2021-04-10 10:12:18+00:00,"@gamdoo ""Haribo macht Kinder fett steht sogar ...",benezans,benezans,d2de95ce61028811,Pinneberg,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (9.79640 53.64639)
317,1380824847333388297,2021-04-10 10:07:52+00:00,@vfbluebeck @Rote_Teufel Wir die WAHREN Fans d...,Big Jay,BigJay79971891,73a3419e2342ce82,Ahrensburg,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (10.24214 53.66993)
318,1380823426865184773,2021-04-10 10:02:14+00:00,@rune737 @infoluencer Da hat der Text von Amth...,Franziska Kapteina,Conny00X,1b9b5e83e647a7ed,Kiel,DE,Germany,0,...,00,00,000,0,DEF,2881926,15802.28,20.924937,2.159523,POINT (10.12575 54.34180)
