In [1]:
import pandas as pd

In [2]:
# Datei mit Twitterdaten
input_file = 'all_tweets.csv'

In [3]:
# Liest die CSV-Datei und erstellt einen DataFrame
tweets_df = pd.read_csv(input_file)

In [4]:
# Schneller Blick in die Daten
tweets_df

Unnamed: 0,hashtag,date,tweet_id,content,username,retweets,likes,replies
0,#MLF23,2023-04-15 10:52:06+00:00,1647191059431038977,Som els √∫nics que tenim aquest tema√ß en bucle?...,MallorcaLiveFes,0,3,0
1,#MLF23,2023-04-13 16:00:23+00:00,1646543865623597058,Un petit recap de la que va ser una tarda plen...,MallorcaLiveFes,0,4,0
2,#MLF23,2023-04-13 11:49:30+00:00,1646480727704322049,Aqu√≠ ja olora a estiu i estiu vol dir #MLF23 ...,MallorcaLiveFes,0,6,1
3,#MLF23,2023-04-12 10:20:12+00:00,1646095869257580544,¬øTe imaginas empezar el fin de semana as√≠? üí• A...,MallorcaLiveFes,2,7,0
4,#MLF23,2023-04-12 10:10:11+00:00,1646093348833312768,"Es viernes y el cuerpo lo sabe, porque con est...",MallorcaLiveFes,1,7,0
...,...,...,...,...,...,...,...,...
628,#ImolaGP,2023-04-02 15:47:24+00:00,1642554332036866051,Real or Game? Fernando Alonso Driver Eye Camer...,TSGameSpot3,0,1,0
629,#ImolaGP,2023-04-02 05:52:05+00:00,1642404516359184384,WE NEED #ImolaGP NOW MERCEDES FANS,ItsMeMilly305,0,0,0
630,#ImolaGP,2023-04-01 10:56:07+00:00,1642118639536013313,"Ciao, sabato 20/05 andr√≤ a #ImolaGP, c'√® qualc...",acasatuttobene_,0,0,0
631,#ImolaGP,2023-03-31 05:45:48+00:00,1641678157265682432,La nueva goma wet de @pirellisport debutar√° ha...,pasotti_,0,1,0


In [5]:
# Anzahl der fehlenden Werte (NA) pro Spalte berechnen und anzeigen lassen
missing_values = tweets_df.isna().sum()
print(missing_values)

hashtag     0
date        0
tweet_id    0
content     0
username    0
retweets    0
likes       0
replies     0
dtype: int64


In [6]:
# Keine NA (h√∂chstens 0, da nicht richtig gezogen), jedoch zeigt ein kurzer Blick in die csv-Datei, dass i.O.

In [7]:
# Unbrauchbare Spalten entfernen
columns_to_remove = ['tweet_id', 'content', 'username']
tweets_df = tweets_df.drop(columns=columns_to_remove)

In [8]:
# Aktualisierter Datensatz
tweets_df

Unnamed: 0,hashtag,date,retweets,likes,replies
0,#MLF23,2023-04-15 10:52:06+00:00,0,3,0
1,#MLF23,2023-04-13 16:00:23+00:00,0,4,0
2,#MLF23,2023-04-13 11:49:30+00:00,0,6,1
3,#MLF23,2023-04-12 10:20:12+00:00,2,7,0
4,#MLF23,2023-04-12 10:10:11+00:00,1,7,0
...,...,...,...,...,...
628,#ImolaGP,2023-04-02 15:47:24+00:00,0,1,0
629,#ImolaGP,2023-04-02 05:52:05+00:00,0,0,0
630,#ImolaGP,2023-04-01 10:56:07+00:00,0,0,0
631,#ImolaGP,2023-03-31 05:45:48+00:00,0,1,0


In [9]:
# Entfernt das '#' aus der 'hashtag'-Spalte
tweets_df['hashtag'] = tweets_df['hashtag'].str.replace('#', '')

In [10]:
# Neuen DF erstllen, der zu jedem Event, die Anzahl an tweets, retweets, likes und replies aufsummiert
aggregated_df = tweets_df.groupby('hashtag').agg(
    event_name=('hashtag', 'first'),
    tweets=('hashtag', 'count'),
    retweets=('retweets', 'sum'),
    likes=('likes', 'sum'),
    replies=('replies', 'sum')
).reset_index(drop=True)

In [11]:
# Blick in den aggregierten DF zeigt, dass 3 Events aufgrund fehlender Tweets fehlen. Da es kein so grosser Aufwand ist, werden diese manuell eingef√ºgt
aggregated_df

Unnamed: 0,event_name,tweets,retweets,likes,replies
0,BBF23,12,27,71,0
1,FeriadeAbril,303,977,2770,154
2,FoodiesFestival,1,1,4,0
3,Fr√ºhlingsfest,45,9,106,15
4,ImolaGP,67,189,2637,85
5,MLF23,18,8,68,1
6,MonacoGP,129,992,11436,237
7,NS20ans,1,1,3,0
8,apriljazz,4,2,12,0
9,bloemencorso,15,2,5,0


In [12]:
# Einf√ºgen der 3 Events mit 0-Werten
additional_rows = [
    {'event_name': 'lbjw23', 'tweets': 0, 'retweets': 0, 'likes': 0, 'replies': 0},
    {'event_name': 'Karnevalderkulturen', 'tweets': 0, 'retweets': 0, 'likes': 0, 'replies': 0},
    {'event_name': 'H√§ndelFestival', 'tweets': 0, 'retweets': 0, 'likes': 0, 'replies': 0}
]

for row in additional_rows:
    aggregated_df = aggregated_df.append(row, ignore_index=True)

  aggregated_df = aggregated_df.append(row, ignore_index=True)
  aggregated_df = aggregated_df.append(row, ignore_index=True)
  aggregated_df = aggregated_df.append(row, ignore_index=True)


In [13]:
# Speichert den aggregierten DataFrame in einer neuen CSV-Datei
aggregated_df.to_csv('tweets_cleaned.csv', index=False, encoding='utf-8-sig', sep=',')

In [14]:
# Blick in den aggregierten DF
aggregated_df

Unnamed: 0,event_name,tweets,retweets,likes,replies
0,BBF23,12,27,71,0
1,FeriadeAbril,303,977,2770,154
2,FoodiesFestival,1,1,4,0
3,Fr√ºhlingsfest,45,9,106,15
4,ImolaGP,67,189,2637,85
5,MLF23,18,8,68,1
6,MonacoGP,129,992,11436,237
7,NS20ans,1,1,3,0
8,apriljazz,4,2,12,0
9,bloemencorso,15,2,5,0
