In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as plt_ex

def countMissingValues(df, attr):
    """Report and return the number of missing entries in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    attr : str
        Label of the column whose missing values are counted.

    Returns
    -------
    The number of entries of ``df[attr]`` for which ``isna()`` is true.
    A one-line summary is also printed.
    """
    missing_mask = df[attr].isna()
    total_missing = missing_mask.sum()
    print(total_missing, "missing", attr, "values")
    return total_missing

In [2]:
# Path of the raw tweets CSV, given relative to the current working directory.
datasetTweetsCSVPath = "./dataset/tweets.csv"

In [3]:
# Load the raw tweets; the first CSV column is used as the DataFrame index.
df_tweets = pd.read_csv(datasetTweetsCSVPath, sep=',', index_col=0)

In [4]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df_tweets.shape

(13664696, 9)

In [5]:
# Remove duplicated rows, keeping the first occurrence of each.
# NOTE(review): drop_duplicates compares column values only and ignores the index,
# so rows that differ only in their index value are collapsed too — confirm intended.
df_tweets = df_tweets.drop_duplicates(keep='first')

In [6]:
# (rows, columns) after duplicate rows have been removed.
df_tweets.shape

(11712093, 9)

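1. Count the tweets whose text is the empty string ""
2. Replace missing text with ""
3. Check that -1 is not already used as a user_id, then replace missing user_id with -1
4. Drop bad rows (rows missing 4 or more of the 7 remaining attributes)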

In [7]:
# Count tweets whose text is exactly the empty string (missing values do not match).
int((df_tweets["text"] == "").sum())

0

In [8]:
# Number of tweets whose text is missing.
df_tweets.text.isna().sum()

530031

In [9]:
# Replace missing tweet text with the empty string.
# The filled column is assigned back explicitly: fillna(..., inplace=True) called on
# `df_tweets.text` acts on an intermediate Series, which pandas warns about and which
# leaves df_tweets unchanged when Copy-on-Write is enabled.
df_tweets["text"] = df_tweets["text"].fillna("")
# Sanity check: no missing text should remain.
df_tweets["text"].isna().sum()

0

In [10]:
# Count rows whose user_id already equals -1 (missing values do not match).
int((df_tweets["user_id"] == -1).sum())

0

In [11]:
# Number of tweets whose user_id is missing.
df_tweets.user_id.isna().sum()

217276

In [12]:
# Replace missing user_id values with the sentinel -1.
# The filled column is assigned back explicitly: fillna(..., inplace=True) called on
# `df_tweets.user_id` acts on an intermediate Series, which pandas warns about and
# which leaves df_tweets unchanged when Copy-on-Write is enabled.
df_tweets["user_id"] = df_tweets["user_id"].fillna(-1)
# Sanity check: no missing user_id should remain.
df_tweets["user_id"].isna().sum()

0

In [13]:
# Boolean frame with the same shape as df_tweets: True wherever a value is missing.
tweets_isna = df_tweets.isna()

In [14]:
# Preview the first rows of the missing-value mask.
tweets_isna.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
509354017856950272,False,False,False,False,False,False,False,False,False
583002592529121280,False,False,False,False,False,False,False,False,False
461498835362013185,False,False,False,False,False,False,False,False,False
528808127366692864,False,False,False,False,False,False,False,False,False
575336690904006656,False,False,False,False,False,False,False,False,False


In [15]:
# Per-row number of missing attributes (True flags summed across the columns).
count_of_missing_val = tweets_isna.sum(axis = 1)

In [16]:
# Largest number of missing attributes found in any single row.
count_of_missing_val.max()

6

In [17]:
# Select the rows that have fewer than 4 missing attributes.
# NOTE(review): the selection is only displayed, not assigned, so df_tweets still
# contains the rows with 4 or more missing attributes — confirm whether the result
# was meant to be stored.
df_tweets[count_of_missing_val < 4]

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
509354017856950272,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang..."
583002592529121280,333722906,1,0,0,0,0,1,2020-04-01 20:27:04,"""@BestWSHHVids: how do you say these words wit..."
461498835362013185,2379755827,0,0,0,0,0,1,2019-05-02 13:34:31,@LOLatComedy awsome
528808127366692864,466226882,0,0,0,0,0,0,2019-11-04 07:17:37,Stephen Hawkins: i buchi neri non esistono se ...
575336690904006656,1355537995,114,0,0,1,0,1,2020-03-11 16:45:31,RT @tibbs_montris: So ready for Wednesday!
...,...,...,...,...,...,...,...,...,...
329829994169786368,220933018,0,0,0,0,0,0,2018-05-04 05:29:33,ESTA MANANA AUN ESTA MUY FRIO ! MIREN ESTO ! ...
588535254207467520,587491046,0,0,0,0,0,1,2020-04-17 02:51:53,"@warriors Congrats, maybe I'll be able to get ..."
718157017970961,91781300,0,,l25suv5,,0.0,,2016-07-10 22:43:09,
441301348676415488,127895572,0,0,1,1,0,0,2019-03-07 19:56:55,Shooting crew of porn movies. #TheWorstJobToHave


In [18]:
def IsInteger(num):
    """Return True when ``num`` converts to a float holding a whole number.

    Parameters
    ----------
    num : object
        Value to test, e.g. a number or a numeric string.

    Returns
    -------
    bool
        True if ``float(num)`` succeeds and the result has no fractional part.
        False for NaN, infinities, non-numeric strings, and values that
        ``float()`` cannot accept at all (such as None or a list).
    """
    try:
        return float(num).is_integer()
    except (TypeError, ValueError):
        # ValueError: unparseable string; TypeError: unsupported type such as None.
        return False

def count_integers(df, attr):
    """Print how many entries of ``df[attr]`` hold a whole-number value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    attr : str
        Label of the column to scan.

    Each entry is tested with ``IsInteger``; entries for which it returns
    False (including NaN) are not counted. Nothing is returned.
    """
    # Read from the frame that was passed in rather than the global df_tweets.
    numbers = df[attr].apply(IsInteger).sum()
    print("Number of not NaN integers", numbers)
    
def object_to_numeric(df, attr):
    """Convert column ``attr`` of ``df`` to a numeric dtype, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; its ``attr`` column is overwritten.
    attr : str
        Label of the column to convert.

    Entries that cannot be parsed as numbers become NaN. The dtype and the
    missing-value count are printed before and after the conversion.
    Nothing is returned.
    """
    original = df[attr]
    print("Type before conversion:", original.dtype)
    print("Missing values before conversion:", original.isna().sum())

    # errors='coerce' maps anything that is not a number to NaN instead of raising
    df[attr] = pd.to_numeric(original, errors='coerce')

    converted = df[attr]
    print("Missing values after conversion:", converted.isna().sum())
    print("Type after conversion:", converted.dtype)

In [19]:
# How many retweet_count entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "retweet_count")

Number of not NaN integers 11086636


In [20]:
# Convert retweet_count to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "retweet_count")

Type before conversion: object
Missing values before conversion: 437134
Missing values after conversion: 625456
Type after conversion: float64


In [21]:
# How many reply_count entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "reply_count")

Number of not NaN integers 10902593


In [22]:
# Convert reply_count to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "reply_count")

Type before conversion: object
Missing values before conversion: 647873
Missing values after conversion: 809499
Type after conversion: float64


In [23]:
# How many favorite_count entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "favorite_count")

Number of not NaN integers 10903997


In [24]:
# Convert favorite_count to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "favorite_count")

Type before conversion: object
Missing values before conversion: 647541
Missing values after conversion: 808096
Type after conversion: float64


In [25]:
# How many num_hashtags entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "num_hashtags")

Number of not NaN integers 10548432


In [26]:
# Convert num_hashtags to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "num_hashtags")

Type before conversion: object
Missing values before conversion: 1057508
Missing values after conversion: 1163661
Type after conversion: float64


In [27]:
# How many num_urls entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "num_urls")

Number of not NaN integers 10903152


In [28]:
# Convert num_urls to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "num_urls")

Type before conversion: object
Missing values before conversion: 648611
Missing values after conversion: 808941
Type after conversion: float64


In [29]:
# How many num_mentions entries hold a whole number, before the column is coerced.
count_integers(df_tweets, "num_mentions")

Number of not NaN integers 10724119


In [30]:
# Convert num_mentions to numeric in place; unparseable entries become NaN.
object_to_numeric(df_tweets, "num_mentions")

Type before conversion: object
Missing values before conversion: 854149
Missing values after conversion: 987974
Type after conversion: float64


In [31]:
# Parse created_at into datetimes using the fixed "YYYY-MM-DD HH:MM:SS" layout.
df_tweets["created_at"] = pd.to_datetime(
    df_tweets["created_at"],
    format='%Y-%m-%d %H:%M:%S',
)

In [32]:
# Column dtypes after the numeric and datetime conversions.
df_tweets.dtypes

user_id                   object
retweet_count            float64
reply_count              float64
favorite_count           float64
num_hashtags             float64
num_urls                 float64
num_mentions             float64
created_at        datetime64[ns]
text                      object
dtype: object