In [1]:
from bertopic import BERTopic
import pandas as pd
import nltk
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the hand-labelled article summaries and keep only the period of interest.
# Produces:
#   df   - filtered DataFrame restricted to the columns listed in KEEP_COLUMNS
#   docs - list with one summary string per remaining row (same order as df)

YEAR = 2014       # calendar year to analyse
START_MONTH = 1   # first month (1-12) of YEAR to include; 1 keeps the whole year

KEEP_COLUMNS = ['summary', 'date', 'chat_GPT_location', 'lat', 'lng', 'article_id', 'cd_district_name']

df = pd.read_csv("data/articles_handlabeled_v2_cleaned.csv", parse_dates=["date"])
df = df[KEEP_COLUMNS]

# Single combined mask instead of two successive filters; rows whose date is
# missing (NaT) compare False on both conditions and are therefore excluded.
in_period = (df['date'].dt.year == YEAR) & (df['date'].dt.month >= START_MONTH)

# Rows without a summary are dropped because the later text processing calls
# string methods on every document. `.copy()` detaches the result from the raw
# frame so later column assignments do not trigger SettingWithCopyWarning.
df = df[in_period].dropna(subset=['summary']).copy()
print(df.shape)  # (rows, columns) after filtering

docs = df["summary"].tolist()  # One summary string per article

df

(139, 7)


Unnamed: 0,summary,date,chat_GPT_location,lat,lng,article_id,cd_district_name
98,The article discusses the ongoing fighting between SPLA forces and the defected army division in...,2014-01-07,Bentiu,9.231487,29.800503,8828,Rubkona
99,The article discusses the recapture of Bentiu in Unity state by South Sudan government troops af...,2014-01-10,Bentiu,9.231487,29.800503,8869,Rubkona
100,The article discusses the ongoing conflict in South Sudan between the government soldiers and re...,2014-01-09,Bentiu,9.231487,29.800503,8886,Rubkona
101,"The article discusses the recapture of Bentiu by South Sudanese government forces, the looting o...",2014-01-10,Bentiu,9.231487,29.800503,8888,Rubkona
102,"The article discusses the ongoing conflict in South Sudan, particularly the SPLA's attempts to r...",2014-01-10,Bentiu,9.231487,29.800503,8912,Rubkona
...,...,...,...,...,...,...,...
232,The article discusses the United Nations Security Council's condemnation of renewed deadly clash...,2014-11-06,Bentiu,9.231487,29.800503,12692,Rubkona
233,The article discusses how Nuer communities sheltering in UN camps in South Sudan and neighbourin...,2014-12-03,Bentiu,9.231487,29.800503,12846,Rubkona
234,The article discusses a report issued by the Human Rights Division of the United Nations Mission...,2014-12-19,Bentiu,9.231487,29.800503,12918,Rubkona
235,The article discusses a UN report which states that South Sudanese opposition forces committed w...,2014-12-19,Bentiu,9.231487,29.800503,12936,Rubkona


In [15]:
nltk.download("stopwords")


def remove_words(text, unwanted):
    """Return `text` without the tokens listed in `unwanted`.

    The text is split on whitespace, every token whose lowercase form is a
    member of `unwanted` is discarded, and the remaining tokens are re-joined
    with single spaces. Punctuation is not stripped, so a token such as
    "the," is kept even when "the" is in `unwanted`.

    Parameters:
        text: the document to filter (str).
        unwanted: container of lowercase words to drop; a set gives O(1) lookups.

    Returns:
        The filtered document as a single string.
    """
    return " ".join(token for token in text.split() if token.lower() not in unwanted)


# Load the stopword list once. Calling stopwords.words("english") inside the
# per-word test (as before) re-reads the corpus and scans a list for every token.
english_stopwords = set(stopwords.words("english"))

# Boilerplate that opens nearly every summary ("The article discusses ...")
words_to_remove = {"article", "discusses"}

# Stage 1: drop English stopwords from each summary
filtered_docs = [remove_words(doc, english_stopwords) for doc in docs]

# Stage 2: additionally drop the summary boilerplate words
very_filtered_docs = [remove_words(doc, words_to_remove) for doc in filtered_docs]

# Show a small sample rather than dumping every document into the output
very_filtered_docs[:5]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20201708\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['ongoing fighting SPLA forces defected army division several villages north Bentiu Mayom County, South Sudan. traders Unity State caught war Kordofan Unity State. UN peacekeeping mission observed villages along road Mayom Junction Pariang appeared burnt looted. Severe shortages food, water, shelter reported. UN refugee agency UNHCR announced return personnel Yida Unity State provided UN peacekeepers. South Sudan Government declared State Emergency Unity Jonglei states.',
 'recapture Bentiu Unity state South Sudan government troops battle rebel forces. town areas South Sudan faced devastation month-long conflict began renegade soldiers attacked army headquarters Juba. South Sudanese government troops plan recapture capital Jonglei state next. Foreign companies pulled workers oil-producing Unity state due deteriorating security situation. Additionally, negotiations peace talks struggled make progress. 60,000 people currently sheltering U.N. bases throughout country 250,000 displaced fig

In [17]:
# Create and fit the BERTopic model
SEED = 42  # fixed seed so the topic assignment is reproducible across runs

bertopic = BERTopic()
# BERTopic's default dimensionality-reduction model (UMAP) is stochastic and is
# created without a random_state, so topics differ between runs. Seed it before
# fitting. NOTE(review): assumes the default `umap_model` exposes a
# `random_state` parameter; set_params raises ValueError if it does not.
bertopic.umap_model.set_params(random_state=SEED)
topics, _ = bertopic.fit_transform(very_filtered_docs)  # second return value is not used

# Get the frequency distribution of topics
topic_freq = bertopic.get_topic_freq()

# Keep the first 15 rows of the frequency table (not displayed in this cell)
top_15_topics = topic_freq.head(15)

# Widen text columns so the representation / representative docs are readable
pd.set_option('display.max_colwidth', 100)  # You can adjust the value as needed

# Per-topic overview table, indexed by topic id
bertopic.get_topic_info().set_index('Topic')


Unnamed: 0_level_0,Count,Name,Representation,Representative_Docs
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,56,-1_south_sudan_state_sudanese,"[south, sudan, state, sudanese, bentiu, forces, government, unity, conflict, rebel]","[clashes South Sudanese government troops rebel forces Unity state, particularly areas Panhiany ..."
0,26,0_civilians_bentiu_sudan_south,"[civilians, bentiu, sudan, south, un, killed, killings, violence, people, unmiss]","[United Nations' condemnation targeted killings civilians based ethnicity nationality Bentiu, ca..."
1,19,1_oil_unity_government_state,"[oil, unity, government, state, control, rebels, south, bentiu, army, capital]","[ongoing fighting control Bentiu, capital Unity state South Sudan, army opposition forces. oppos..."
2,19,2_south_rebels_sudan_president,"[south, rebels, sudan, president, government, salva, kiir, peace, conflict, ongoing]","[ongoing fighting several states South Sudan, recent accusations UN rebels killed hundreds civil..."
3,19,3_south_sudan_violence_civilians,"[south, sudan, violence, civilians, peace, rights, un, human, hostilities, conflict]","[deteriorating human rights situation South Sudan, including increasing ethnic violence revenge ..."
