In [95]:
import pandas as pd
from math import radians, sin, cos, sqrt, atan2

In [96]:


# Read the cleaned article summaries and give every row a sequential id (0..n-1)
articles = pd.read_csv('data/articles_summary_cleaned.csv', parse_dates=['date'])
articles['article_id'] = range(len(articles))

# Read the food-crises table and keep one row per distinct
# (district, centx, centy) combination as the centroid lookup
df2 = pd.read_csv('data/food_crises_cleaned.csv', parse_dates=['date'])
centroids = (
    df2.loc[:, ['district', 'centx', 'centy']]
    .drop_duplicates()
    .reset_index(drop=True)
)

display(centroids)

# Function to calculate Haversine distance between two coordinates
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, the Earth radius in kilometers.

    Returns
    -------
    float
        Distance along the surface of the sphere. NaN if any coordinate
        is NaN.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # For (near-)antipodal points floating-point rounding can push `a` a hair
    # above 1, which would make sqrt(1 - a) raise "math domain error".
    # An explicit comparison (rather than min/max) lets NaN pass through.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius_km * c

# Assign every article the district whose centroid is closest to its coordinates
def nearest_district(lat, lng, district_points):
    """Return the name of the district whose centroid is closest to (lat, lng).

    Parameters
    ----------
    lat, lng : float
        Coordinates of the point to classify, in decimal degrees.
    district_points : iterable of (name, centroid_lat, centroid_lng)
        Candidate districts with their centroid coordinates.

    Returns
    -------
    str
        Name of the closest district. An empty string is returned when no
        candidate compares closer than infinity (e.g. the coordinates are
        NaN or `district_points` is empty).
    """
    best_name = ""
    best_distance = float('inf')
    for name, cent_lat, cent_lng in district_points:
        dist = haversine(lat, lng, cent_lat, cent_lng)
        if dist < best_distance:  # strict '<' keeps the first centroid on ties
            best_distance = dist
            best_name = name
    return best_name

# Plain tuples are far cheaper to loop over than DataFrame.iterrows() rows.
# centy is passed as latitude and centx as longitude.
district_points = list(zip(centroids['district'], centroids['centy'], centroids['centx']))

# Many articles share the same geocoded point, so each distinct coordinate
# pair is resolved only once and the column is assigned in a single step.
nearest_by_coords = {}
district_names = []
for lat, lng in zip(articles['lat'], articles['lng']):
    coords = (lat, lng)
    if coords not in nearest_by_coords:
        nearest_by_coords[coords] = nearest_district(lat, lng, district_points)
    district_names.append(nearest_by_coords[coords])
articles['district_name'] = district_names

# display article id and district assigned to it
display(articles[['article_id', 'district_name']])


Unnamed: 0,district,centx,centy
0,Bor,32.00486,6.465644
1,Jur River,28.02439,7.592506
2,Aweil North,26.72969,9.334756
3,Morobo,30.83354,3.737241
4,Magwi,32.21030,3.910535
...,...,...,...
73,Fashoda,31.84424,9.980295
74,Manyo,32.37020,11.130980
75,Melut,32.56855,10.380960
76,Renk,32.95721,11.362640


Unnamed: 0,article_id,district_name
0,0,Juba
1,1,Abiemnhom
2,2,Fashoda
3,3,Twic East
4,4,Juba
...,...,...
18515,18515,Maiwut
18516,18516,Renk
18517,18517,Maiwut
18518,18518,Magwi


# Testing whether articles are correctly classified for Juba

In [97]:
# Rows of `articles` whose nearest-centroid district is Juba
juba_articles = articles.loc[articles['district_name'] == 'Juba']

In [98]:
# Show the articles assigned to Juba to sanity-check the nearest-centroid matching
juba_articles

Unnamed: 0,summary,date,location_article,lat,lng,article_id,district_name
0,The article discusses the passing of the new C...,2011-07-07,Juba,4.859363,31.57125,0,Juba
4,The article discusses the need for South Sudan...,2011-07-02,Juba,4.859363,31.57125,4,Juba
8,The article discusses the Elders' letter of co...,2011-07-07,Juba,4.859363,31.57125,8,Juba
10,The article discusses a mock parade conducted ...,2011-07-01,Juba,4.859363,31.57125,10,Juba
11,The article discusses the South Sudan governme...,2011-07-03,Juba,4.859363,31.57125,11,Juba
...,...,...,...,...,...,...,...
18497,The article discusses the joint ecumenical pra...,2023-02-04,Juba,4.859363,31.57125,18497,Juba
18499,The article discusses how increasing global in...,2023-02-12,Juba,4.859363,31.57125,18499,Juba
18502,The article discusses a closed-door meeting be...,2022-12-28,Juba,4.859363,31.57125,18502,Juba
18508,The article discusses South Sudan's first Inte...,2023-02-20,Radisson in Juba,4.859363,31.57125,18508,Juba


In [99]:
# Look up the centroid row stored for Juba, for comparison with the article coordinates
centroids.loc[centroids['district'] == 'Juba']

Unnamed: 0,district,centx,centy
16,Juba,31.48342,4.717081


# Subsetting articles for the three districts of interest

In [100]:
# Keep only the articles assigned to Koch, Gogrial East or Rubkona, then
# rename the location columns and renumber the rows from 0.
articles_filtered = (
    articles[articles['district_name'].isin(['Koch', 'Gogrial East', 'Rubkona'])]
    .rename(columns={
        'location_article': 'chat_GPT_location',
        'district_name': 'cd_district_name',  # cd = closest distance district name
    })
    .reset_index(drop=True)
)
articles_filtered

Unnamed: 0,summary,date,chat_GPT_location,lat,lng,article_id,cd_district_name
0,The article discusses the readiness of Guit co...,2011-07-07,Bentiu,9.231487,29.800503,88,Rubkona
1,The article discusses the celebration of South...,2011-07-09,Bentiu,9.231487,29.800503,103,Rubkona
2,The article discusses the congratulatory messa...,2011-07-11,Bentiu,9.231487,29.800503,221,Rubkona
3,The article discusses the killing of rebel lea...,2011-07-23,Koch county,8.684727,29.881520,299,Koch
4,The article discusses the killing of South Sud...,2011-07-23,Unity state,8.927721,29.788925,327,Koch
...,...,...,...,...,...,...,...
678,The article discusses the doubling of people a...,2022-10-12,Unity state,8.927721,29.788925,18403,Koch
679,The article discusses the impact of historic f...,2022-10-23,Bentiu,9.231487,29.800503,18407,Rubkona
680,The article discusses the intensification of a...,2022-09-26,Unity State,8.927721,29.788925,18427,Koch
681,The article discusses a UN report documenting ...,2022-09-06,Unity state,8.927721,29.788925,18439,Koch


In [101]:
# Write the filtered articles to CSV; index=False leaves the row index out of the file
articles_filtered.to_csv('data/articles_summary_cleaned_with_location.csv', index=False)