# Demonstration of different pre-processing techniques including missing value handling and data discretization on any dataset

### Load the dataset and print top five rows

In [10]:
import pandas as pd
data=pd.read_csv("UPELECTIONS.csv")
print(data.head)

<bound method NDFrame.head of       Unnamed: 0                        user_name  \
0              0                       Mate_Ebong   
1              1                     Sameera Khan   
2              2                       seemaverma   
3              3                          Lucifer   
4              4                          shabbir   
...          ...                              ...   
3430        3430                      Tanvi Patel   
3431        3431                       INC akhter   
3432        3432              Ganesh Chand Rajwar   
3433        3433  Raji Loganathan- Always for INC   
3434        3434                     G. Kris Nair   

                       user_location  \
0                                NaN   
1                              India   
2                             India    
3                riyadh saudi arabia   
4                                NaN   
...                              ...   
3430               Chandigarh, India   
3431         

### Print the all the columns names

In [11]:
print(data.columns)

Index(['Unnamed: 0', 'user_name', 'user_location', 'user_description',
       'user_verified', 'date', 'text', 'hashtags', 'source'],
      dtype='object')


### Print the shape of data

In [12]:
print(data.shape)

(3435, 9)


### Find all null values in all the columns

In [13]:
print(data.isnull().sum())

Unnamed: 0             0
user_name              0
user_location       1255
user_description     629
user_verified          0
date                   0
text                   0
hashtags            1516
source                 0
dtype: int64


### Extract the information of dataset

In [14]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        3435 non-null   int64 
 1   user_name         3435 non-null   object
 2   user_location     2180 non-null   object
 3   user_description  2806 non-null   object
 4   user_verified     3435 non-null   bool  
 5   date              3435 non-null   object
 6   text              3435 non-null   object
 7   hashtags          1919 non-null   object
 8   source            3435 non-null   object
dtypes: bool(1), int64(1), object(7)
memory usage: 218.2+ KB
None


### Describe the dataset

In [15]:
print(data.describe())

        Unnamed: 0
count  3435.000000
mean   1717.000000
std     991.743414
min       0.000000
25%     858.500000
50%    1717.000000
75%    2575.500000
max    3434.000000


### Replace all the NaN value in user_location column with India

In [25]:
data['user_location'] = data['user_location'].fillna("India")

### Replace all the NaN value in user_description columns with Employee

In [26]:
data['user_description'] = data['user_description'].fillna("Employee")

### Replace all the NaN value in hashtags column with UPELECTION

In [27]:
data['hashtags'] = data['hashtags'].fillna("UPElection")

### Verify the number of NaN value in each column

In [28]:
print(data.isnull().sum())

Unnamed: 0          0
user_name           0
user_location       0
user_description    0
user_verified       0
date                0
text                0
hashtags            0
source              0
dtype: int64


### Count the duplicate value in source column , here we see that the source of most of the user is Android

In [30]:
# Get count duplicates single column using dataframe.pivot_table()
data2 = data.pivot_table(index = ['source'], aggfunc ='size')
print(data2)

source
Blog2Social APP                 1
FS-Poster APP                   1
FenixApp                        1
Hocalwire Social Share          7
Information Critical            1
Naattuvartha                    1
Ryzely                          1
Semrush Social Media Tool       1
TweetCaster for Android         1
TweetDeck                       7
Twitter Media Studio            5
Twitter Web App               465
Twitter for Android          2607
Twitter for Mac                 1
Twitter for iPad                8
Twitter for iPhone            326
dlvr.it                         1
dtype: int64


### The most duplicates in hashtags column is ['UPElection'] , ignore UPElection that is I replace it with NaN.

In [39]:
data2 = data.pivot_table(index = ['hashtags'], aggfunc ='size',sort=False)
print(data2.sample)

<bound method NDFrame.sample of hashtags
UPElection                                                                           1516
['UPElection']                                                                        550
['AssemblyElections2022', 'RLD', 'SamajwadiParty', 'Mayawati']                          6
['BJP', 'UP', 'PMModi', 'Saharanpur']                                                   1
['LakhimpurKheri', 'UPElection', 'UPElections2022']                                     1
                                                                                     ... 
['UttarPradesh', 'UPElection2022', 'UPElection', 'KeshavPrasadMaurya', 'Lucknow']       1
['India', 'Pakistan']                                                                   1
['UPElections2022', 'UPElection', 'SamajwadiParty', 'BJP', 'AkhileshYadav']             1
['UPElections2022', 'UPElection2022', 'UPElection']                                     1
['AajKiBaat']                                              

In [48]:
data2 = data.pivot_table(index = ['text'], aggfunc ='size',sort=False)
print(data2.head())

text
RT @yoda_xyz: The high-stakes battle for Uttar Pradesh kicked off today. What will the Jat-dominant belt of the western UP vote for?\n\nPredi…                                                                                                                   3
#UPElection - important context https://t.co/TcVK8qyJiQ                                                                                                                                                                                                          1
RT @RubinaAfaqueIND: SP leader Vipin Manothia was beaten up by BJP workers at the polling station in Meerut, Uttar Pradesh, the police rema…                                                                                                                     7
Vote unitedly to route out BJP first in #AssemblyElections2022 followed by 2024.\n\n#RLD #SamajwadiParty #Mayawati #Congress #AIMIM #jayantchaudhary #Akhilesh #UPElections #UttarPradeshElections2022 #UPElections2022 #U

### Printing high frequency text

In [51]:
counts = data['text'].value_counts()
most_common_string = counts.idxmax()
print(most_common_string)

@sambitswaraj @narendramodi Kindly utilise your right to vote. Do not stay back. Share the message, ReTweet.

खुद भी करें..दूसरों से भी वोट करवाएं..

#vote #UPElection

#UP + #YOGI = #UPYOGI


### Printing high frequency user_name

In [52]:
counts = data['user_name'].value_counts()
most_common_string = counts.idxmax()
print(most_common_string)

Rockstar


### Printing high frequency date

In [53]:
counts = data['date'].value_counts()
most_common_string = counts.idxmax()
print(most_common_string)

2022-02-03 13:45:02+00:00


### Printing high frequency source

In [54]:
counts = data['source'].value_counts()
most_common_string = counts.idxmax()
print(most_common_string)

Twitter for Android


### Printing the second most frequency of user_description (here we not find first most because it Employee that is I replace it with NaN)

In [55]:
counts = data['user_description'].value_counts()
second_most_common_string = counts.index[1]
print(second_most_common_string)

Hari Bhakt
