In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as plt_ex

def countMissingValues(df, attr):
    count = df[attr].isna().sum()
    print(count, "missing", attr, "values")
    return count

In [2]:
datasetTweetsCSVPath = "../dataset/tweets.csv"

In [42]:
destCleanedDatasetPath = "../dataset/clean_datatype_tweets.csv"

In [3]:
df_tweets = pd.read_csv(datasetTweetsCSVPath, sep=',', index_col=0)

In [4]:
df_tweets.shape

(13664696, 9)

In [5]:
df_tweets.text = df_tweets.text.apply(lambda x:x.lower().strip() if type(x) == str else x)

In [6]:
df_tweets = df_tweets.drop_duplicates(keep='first')

In [7]:
df_tweets.shape

(11712091, 9)

In [8]:
df_tweets[df_tweets.text == ""]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1197679862,19300510,0,0,0,0,0,0,2014-02-13 01:59:29,


In [9]:
df_tweets.loc[df_tweets.text == "", "text"] = np.nan

In [10]:
df_tweets.loc["1197679862"]

user_id                      19300510
retweet_count                       0
reply_count                         0
favorite_count                      0
num_hashtags                        0
num_urls                            0
num_mentions                        0
created_at        2014-02-13 01:59:29
text                              NaN
Name: 1197679862, dtype: object

In [11]:
df_tweets.text.isna().sum()

530032

In [44]:
df_tweets.index.value_counts()

dmy                   14
qsa                   12
jgm                   11
nis                   11
enl                   11
                      ..
472403921332613120     1
583261659797426176     1
361607898599657473     1
582869903788634113     1
486479812614451200     1
Name: id, Length: 11671629, dtype: int64

In [12]:
def IsInteger(num):
    try:
        val = float(num)
        return float.is_integer(val)
    except ValueError:
        return False

def countIntegers(df, attr):
    numbers = df_tweets[attr].apply(IsInteger).sum()
    print("Number of not NaN integers", numbers)
    
def object_to_numeric(df, attr):
    print("Type before conversion:", df[attr].dtype)
    old_missingval = df[attr].isna().sum()
    print("Missing values before conversion:", old_missingval)
    # coerces non-numeric junk to NaNs
    df[attr] = pd.to_numeric(df[attr], errors='coerce', downcast='integer')
    #df[attr].fillna(-1).astype(np.int64)
    new_missingval = df[attr].isna().sum()
    print("Missing values after conversion:", new_missingval)
    print("Type after conversion:", df[attr].dtype)
    print("Min:", df[attr].min(), "\tMax:", df[attr].max())

In [13]:
countIntegers(df_tweets, "user_id")

Number of not NaN integers 11278203


In [14]:
object_to_numeric(df_tweets, "user_id")

Type before conversion: object
Missing values before conversion: 217276
Missing values after conversion: 433888
Type after conversion: float64
Min: 0.0 	Max: 2.9999999999999998e+296


In [15]:
countIntegers(df_tweets, "retweet_count")

Number of not NaN integers 11086634


In [16]:
object_to_numeric(df_tweets, "retweet_count")

Type before conversion: object
Missing values before conversion: 437134
Missing values after conversion: 625456
Type after conversion: float64
Min: 0.0 	Max: inf


In [17]:
df_tweets[df_tweets.retweet_count == np.inf]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25722144302443379,201350011.0,inf,,,,,,2018-05-05 15:15:05,@enlajugadarcn @casaleantonio @jorgebermudezh ...


In [18]:
df_tweets.loc[df_tweets.retweet_count == np.inf] = np.nan

In [19]:
df_tweets[df_tweets.retweet_count == np.inf]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [20]:
countIntegers(df_tweets, "reply_count")

Number of not NaN integers 10902591


In [21]:
object_to_numeric(df_tweets, "reply_count")

Type before conversion: object
Missing values before conversion: 647873
Missing values after conversion: 809499
Type after conversion: float64
Min: 0.0 	Max: inf


In [22]:
df_tweets[df_tweets.reply_count == np.inf]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
732386427,466377491.0,0.0,inf,1.0,0.0,,0,2019-08-29 13:04:45,prima di andare via sorridi un po' ma comunque...


In [23]:
df_tweets.loc[df_tweets.reply_count == np.inf] = np.nan

In [24]:
df_tweets[df_tweets.reply_count == np.inf]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [25]:
countIntegers(df_tweets, "favorite_count")

Number of not NaN integers 10903994


In [26]:
object_to_numeric(df_tweets, "favorite_count")

Type before conversion: object
Missing values before conversion: 647542
Missing values after conversion: 808097
Type after conversion: float64
Min: -1.0 	Max: 7e+211


In [27]:
df_tweets[df_tweets.favorite_count == -1]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
521392611366797312,466480215.0,1.0,0.0,-1.0,0,0,0,2019-10-14 20:11:00,studiare studiar studi stud stu st sto stoc st...


In [28]:
df_tweets.loc[df_tweets.favorite_count == -1] = np.nan

In [29]:
df_tweets[df_tweets.favorite_count == -1]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [30]:
countIntegers(df_tweets, "num_hashtags")

Number of not NaN integers 10548428


In [31]:
object_to_numeric(df_tweets, "num_hashtags")

Type before conversion: object
Missing values before conversion: 1057510
Missing values after conversion: 1163663
Type after conversion: float64
Min: 0.0 	Max: 7e+153


In [32]:
countIntegers(df_tweets, "num_urls")

Number of not NaN integers 10903149


In [33]:
object_to_numeric(df_tweets, "num_urls")

Type before conversion: object
Missing values before conversion: 648612
Missing values after conversion: 808942
Type after conversion: float64
Min: 0.0 	Max: 1e+210


In [34]:
countIntegers(df_tweets, "num_mentions")

Number of not NaN integers 10724115


In [35]:
object_to_numeric(df_tweets, "num_mentions")

Type before conversion: object
Missing values before conversion: 854151
Missing values after conversion: 987976
Type after conversion: float64
Min: 0.0 	Max: 6.0000000000000004e+97


In [36]:
df_tweets.created_at = pd.to_datetime(df_tweets.created_at, format='%Y-%m-%d %H:%M:%S')
print("Min:", df_tweets.created_at.min(), "\tMax:", df_tweets.created_at.max())

Min: 1953-04-17 12:24:33 	Max: 2040-04-27 11:03:25


In [37]:
invalid_created_at = df_tweets.loc[(df_tweets["created_at"] < '2012-01-01') | (df_tweets["created_at"] > '2020-12-31')]
invalid_created_at

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
337410935437272894,1.482442e+07,0.0,0.0,0.0,1.0,0.0,1.0,1960-04-26 18:05:56,@arvixesupport shared ssl stopped working on a...
241606582771852729,1.480228e+08,0.0,0.0,0.0,0.0,0.0,0.0,2038-06-22 08:28:55,faqat nabz se haal zaahir na hoga\nmera dil bh...
20477344093836797,1.462660e+07,0.0,0.0,0.0,0.0,0.0,1.0,1958-03-15 16:54:57,@nancyjensen harry &amp; dwntwn 95% clear of s...
890932030,1.521942e+08,0.0,0.0,0.0,0.0,0.0,1.0,2040-02-29 13:20:29,@shuinkou ......./hopes senpie stay healthy......
814741272377044956,1.852826e+09,0.0,0.0,0.0,1.0,0.0,0.0,1959-05-02 23:16:40,her tl is sooo dead omgg #herbff
...,...,...,...,...,...,...,...,...,...
108516312,1.494211e+08,1.0,0.0,0.0,0.0,0.0,2.0,2040-04-20 01:24:22,@yungtrello @rekofromthe5 yes you are
81718936,5.054330e+08,0.0,0.0,0.0,0.0,0.0,0.0,1960-03-17 01:36:18,i hate when people eat and they talk. then the...
119550211118647,1.858934e+09,1730.0,0.0,0.0,0.0,0.0,1.0,1960-03-18 12:53:14,rt @chiiiout: i want to do so much more in thi...
924040103,1.616349e+08,4.0,0.0,2.0,0.0,0.0,0.0,1960-05-16 15:11:04,11:11 for mary jane veloso's life to be comple...


In [38]:
df_tweets.loc[invalid_created_at.index, "created_at"] = np.nan

In [39]:
df_tweets.created_at.isna().sum()

97634

In [40]:
print("Min:", df_tweets.created_at.min(), "\tMax:", df_tweets.created_at.max())

Min: 2012-03-11 22:25:59 	Max: 2020-05-03 10:36:12


In [41]:
df_tweets.dtypes

user_id                  float64
retweet_count            float64
reply_count              float64
favorite_count           float64
num_hashtags             float64
num_urls                 float64
num_mentions             float64
created_at        datetime64[ns]
text                      object
dtype: object

In [45]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11712091 entries, 509354017856950272 to 486479812614451200
Data columns (total 9 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user_id         float64       
 1   retweet_count   float64       
 2   reply_count     float64       
 3   favorite_count  float64       
 4   num_hashtags    float64       
 5   num_urls        float64       
 6   num_mentions    float64       
 7   created_at      datetime64[ns]
 8   text            object        
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 1.1+ GB


# Save on file

In [43]:
df_tweets.to_csv(destCleanedDatasetPath)