In [1]:
import pandas as pd

In [2]:
# Path of the CSV file that holds the Twitter data
input_file: str = "all_tweets.csv"

In [3]:
# Load the CSV file named by `input_file` into a DataFrame
tweets_df = pd.read_csv(filepath_or_buffer=input_file)

In [4]:
# Quick look at the loaded data (the notebook renders the frame as the cell output)
tweets_df

Unnamed: 0,hashtag,date,tweet_id,content,username,retweets,likes,replies
0,#MLF23,2023-04-15 10:52:06+00:00,1647191059431038977,Som els únics que tenim aquest temaç en bucle?...,MallorcaLiveFes,0,3,0
1,#MLF23,2023-04-13 16:00:23+00:00,1646543865623597058,Un petit recap de la que va ser una tarda plen...,MallorcaLiveFes,0,4,0
2,#MLF23,2023-04-13 11:49:30+00:00,1646480727704322049,Aquí ja olora a estiu i estiu vol dir #MLF23 ...,MallorcaLiveFes,0,6,1
3,#MLF23,2023-04-12 10:20:12+00:00,1646095869257580544,¿Te imaginas empezar el fin de semana así? 💥 A...,MallorcaLiveFes,2,7,0
4,#MLF23,2023-04-12 10:10:11+00:00,1646093348833312768,"Es viernes y el cuerpo lo sabe, porque con est...",MallorcaLiveFes,1,7,0
...,...,...,...,...,...,...,...,...
628,#ImolaGP,2023-04-02 15:47:24+00:00,1642554332036866051,Real or Game? Fernando Alonso Driver Eye Camer...,TSGameSpot3,0,1,0
629,#ImolaGP,2023-04-02 05:52:05+00:00,1642404516359184384,WE NEED #ImolaGP NOW MERCEDES FANS,ItsMeMilly305,0,0,0
630,#ImolaGP,2023-04-01 10:56:07+00:00,1642118639536013313,"Ciao, sabato 20/05 andrò a #ImolaGP, c'è qualc...",acasatuttobene_,0,0,0
631,#ImolaGP,2023-03-31 05:45:48+00:00,1641678157265682432,La nueva goma wet de @pirellisport debutará ha...,pasotti_,0,1,0


In [5]:
# Count the missing values (NA) in each column and print the per-column totals
missing_values = (
    tweets_df
    .isna()
    .sum(axis=0)
)
print(missing_values)

hashtag     0
date        0
tweet_id    0
content     0
username    0
retweets    0
likes       0
replies     0
dtype: int64


In [6]:
# No missing values (NA). At most there are zeros where a value was not scraped correctly, but a quick look at the CSV file shows that the data are fine.

In [7]:
# Remove the columns that are not needed.
# errors='ignore' keeps this cell re-runnable: `tweets_df` is overwritten here,
# so on a second run the columns are already gone and a plain drop would raise KeyError.
columns_to_remove = ['tweet_id', 'content', 'username']
tweets_df = tweets_df.drop(columns=columns_to_remove, errors='ignore')

In [8]:
# Show the data set after the column removal
tweets_df

Unnamed: 0,hashtag,date,retweets,likes,replies
0,#MLF23,2023-04-15 10:52:06+00:00,0,3,0
1,#MLF23,2023-04-13 16:00:23+00:00,0,4,0
2,#MLF23,2023-04-13 11:49:30+00:00,0,6,1
3,#MLF23,2023-04-12 10:20:12+00:00,2,7,0
4,#MLF23,2023-04-12 10:10:11+00:00,1,7,0
...,...,...,...,...,...
628,#ImolaGP,2023-04-02 15:47:24+00:00,0,1,0
629,#ImolaGP,2023-04-02 05:52:05+00:00,0,0,0
630,#ImolaGP,2023-04-01 10:56:07+00:00,0,0,0
631,#ImolaGP,2023-03-31 05:45:48+00:00,0,1,0


In [9]:
# Remove every '#' character from the values in the 'hashtag' column.
# regex=False states explicitly that '#' is matched as a literal string, so the
# result does not depend on the pandas version's default for `regex`.
tweets_df['hashtag'] = tweets_df['hashtag'].str.replace('#', '', regex=False)

In [10]:
# Drop the time of day from the 'date' column, keeping only the calendar date.
# utc=True converts every timestamp to UTC before the date is taken, so the column
# is always a proper datetime column (values with differing UTC offsets would
# otherwise not parse into one, and `.dt` would fail). The timestamps shown in this
# notebook are already '+00:00', so their dates are unchanged.
# `.dt.date` leaves plain `datetime.date` objects in the column.
tweets_df['date'] = pd.to_datetime(tweets_df['date'], utc=True).dt.date

In [11]:
# Engagement counters for which a running total is added
metric_columns = ['retweets', 'likes', 'replies']

# Order the rows by hashtag and, within each hashtag, by ascending date
tweets_df = tweets_df.sort_values(by=['hashtag', 'date'])

# Running total of each counter within its hashtag group. The sum advances row by
# row, so rows that share a date still get individual cumulative values.
running_totals = tweets_df.groupby('hashtag')[metric_columns].cumsum()

# Store each running total next to its source column as '<name>_cum'
for metric in metric_columns:
    tweets_df[f'{metric}_cum'] = running_totals[metric]

In [12]:
# Write the cleaned frame, including the cumulative columns, to a new CSV file.
# The row index is left out; 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file.
tweets_df.to_csv(
    path_or_buf='tweets_cleaned_aggr.csv',
    sep=',',
    encoding='utf-8-sig',
    index=False,
)

In [13]:
# Look at the frame with the added cumulative columns
tweets_df

Unnamed: 0,hashtag,date,retweets,likes,replies,retweets_cum,likes_cum,replies_cum
29,BBF23,2023-03-30,17,40,0,17,40,0
28,BBF23,2023-03-31,2,6,0,19,46,0
27,BBF23,2023-04-04,0,1,0,19,47,0
24,BBF23,2023-04-05,0,0,0,19,47,0
25,BBF23,2023-04-05,0,0,0,19,47,0
...,...,...,...,...,...,...,...,...
83,kingsday,2023-04-14,3,29,0,20,122,12
76,kingsday,2023-04-15,0,1,0,20,123,12
77,kingsday,2023-04-15,0,0,1,20,123,13
78,kingsday,2023-04-15,0,0,0,20,123,13
