# Validating Your Data

## Figuring out what’s in your data & Removing duplicates

In [1]:
#First, the libraries are imported.
from lxml import objectify
import pandas as pd

#parse means 'ayristirma'
xml = objectify.parse(open('XMLData2.xml'))

#You obtain access to the root node using the getroot() method. Because XML are structured over a root node, analog to a tree.
root = xml.getroot()

#Data handling relies on a DataFrame.
df = pd.DataFrame(columns=('Number', 'String', 'Boolean'))

for i in range(0, 4):
    obj = root.getchildren()[i].getchildren()
    
    df.loc[i, 'Number'] = obj[0].text
    df.loc[i, 'String'] = obj[1].text
    df.loc[i, 'Boolean'] = obj[2].text
    # Print extracted values for debugging
    #print([obj[0].text, obj[1].text, obj[2].text])

print("Original DataFrame:")
print()
print(df.drop_duplicates())
print()
    
#Check for duplicates
duplicates = df[df.duplicated()]
print("Duplicate Rows:")
print()
print(duplicates)

Original DataFrame:

  Number  String Boolean
0      1   First    True
1      2  Second   False
2      3   Third    True

Duplicate Rows:

  Number String Boolean
3      3  Third    True


## Creating a data map and data plan

In [2]:
#Modified

import pandas as pd
pd.set_option('display.width', 55)

df = pd.DataFrame({'A': [0,0,0,0,0,1,1,1,1,1,1],
                   'B': [1,2,3,5,4,2,5,6,7,5,4],
                   'C': [5,3,4,1,1,2,3,4,3,2,1]})

a_group_desc = df.groupby('A').describe()
print(a_group_desc)

      B                                            \
  count      mean       std  min   25%  50%   75%   
A                                                   
0   5.0  3.000000  1.581139  1.0  2.00  3.0  4.00   
1   6.0  4.833333  1.722401  2.0  4.25  5.0  5.75   

           C                                          
   max count mean       std  min  25%  50%  75%  max  
A                                                     
0  5.0   5.0  2.8  1.788854  1.0  1.0  3.0  4.0  5.0  
1  7.0   6.0  2.5  1.048809  1.0  2.0  2.5  3.0  4.0  


In [3]:
stacked = a_group_desc.stack()
print(stacked)

                B         C
A                          
0 count  5.000000  5.000000
  mean   3.000000  2.800000
  std    1.581139  1.788854
  min    1.000000  1.000000
  25%    2.000000  1.000000
  50%    3.000000  3.000000
  75%    4.000000  4.000000
  max    5.000000  5.000000
1 count  6.000000  6.000000
  mean   4.833333  2.500000
  std    1.722401  1.048809
  min    2.000000  1.000000
  25%    4.250000  2.000000
  50%    5.000000  2.500000
  75%    5.750000  3.000000
  max    7.000000  4.000000


In [4]:
print(a_group_desc.loc[:,(slice(None),['count','mean']),])

      B               C     
  count      mean count mean
A                           
0   5.0  3.000000   5.0  2.8
1   6.0  4.833333   6.0  2.5


# Manipulating Categorical Variables

## Sidebar: Checking your version of pandas

In [5]:
import pandas as pd
print(pd.__version__)

2.1.4


## Creating categorical variables

In [6]:
import pandas as pd

#catergorical variable is created. dytpe set to category
car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')

#NaN is created when there is no match.
car_data = pd.Series(
    pd.Categorical(
        ['Yellow', 'Green', 'Red', 'Blue', 'Purple'], 
                   categories=car_colors, ordered=False))

#You ask panda which entries are actually null.
find_entries = pd.isnull(car_data)

print(car_colors)
print()
print(car_data)
print()

#This verifies the fact for you.
print(find_entries[find_entries == True])

0     Blue
1      Red
2    Green
dtype: category
Categories (3, object): ['Blue', 'Green', 'Red']

0      NaN
1    Green
2      Red
3     Blue
4      NaN
dtype: category
Categories (3, object): ['Blue', 'Green', 'Red']

0    True
4    True
dtype: bool


## Renaming levels

In [7]:
#The error you're encountering is due to the fact that the 'categories' 
#property of a Categorical object in pandas is not directly modifiable.
import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'],
                       dtype='category')
car_data = pd.Series(
    pd.Categorical(
        ['Blue', 'Green', 'Red', 'Blue', 'Red'],
        categories=car_colors, ordered=False))

new_categories = ["Purple", "Yellow", "Mauve"]

# Create a new categorical variable with the desired categories
new_car_colors = pd.Categorical(['Purple', 'Yellow', 'Mauve'], categories=new_categories, ordered=False)

# Assign the new categories to the car_data
car_data = pd.Series(new_car_colors)

print(car_data)

0    Purple
1    Yellow
2     Mauve
dtype: category
Categories (3, object): ['Purple', 'Yellow', 'Mauve']


## Combining levels

In [8]:
import pandas as pd

car_colors = pd.Series(['Blue', 'Red', 'Green'],
    dtype='category')
car_data = pd.Series(
    pd.Categorical(
       ['Blue', 'Green', 'Red', 'Green', 'Red', 'Green'],
       categories=car_colors, ordered=False))

car_data = car_data.cat.set_categories(
    ["Blue", "Red", "Green", "Blue_Red"])

#Combining Blue and Red together is a two-step process. First, you add the Blue_Red to car_data.
#Then you change Red and Blue entries to Blue_Red.
#print(car_data.loc[car_data.isin(['Red'])])

#isin() locates the Red entries, and loc[], which obtains their index. 
car_data.loc[car_data.isin(['Red'])] = 'Blue_Red'
car_data.loc[car_data.isin(['Blue'])] = 'Blue_Red'

#As a final step, you can remove the unneeded categories.
car_data = car_data.cat.set_categories(
    ["Green", "Blue_Red"])

print()
print(car_data)


0    Blue_Red
1       Green
2    Blue_Red
3       Green
4    Blue_Red
5       Green
dtype: category
Categories (2, object): ['Green', 'Blue_Red']


# Dealing with Dates in Your Data

## Formatting date and time values

In [9]:
import datetime as dt

now = dt.datetime.now()

print(str(now))
print(now.strftime('%a, %d %B %Y'))

2024-01-01 15:37:21.846715
Mon, 01 January 2024


## Using the right time transformation

In [10]:
import datetime as dt

#You can use days, seconds, microseconds, milliseconds, minutes, hours, weeks.
#now is the local time.
now = dt.datetime.now()
timevalue = now + dt.timedelta(hours=2)

print(now.strftime('%H:%M:%S'))
print(timevalue.strftime('%H:%M:%S'))
print(timevalue - now)

15:37:21
17:37:21
2:00:00


# Dealing with Missing Data

## Finding the missing data

In [11]:
import pandas as pd
import numpy as np

s = pd.Series([1, 2, 3, np.NaN, 5, 6, None])

print(s.isnull())

print()
print(s[s.isnull()])

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

3   NaN
6   NaN
dtype: float64


## Encoding missingness

In [12]:
import pandas as pd
import numpy as np

s = pd.DataFrame({'A': [0,0,0,0,0,0,0],
                  'B': [1, 2, 3, np.NaN, 5, 6, None]})

#.desribe() gives you count, mean, std , min, max values etc.
group_desc = s.groupby('A').describe()
print(s)
print()
print(group_desc)
print()

#.fillna fills the gap with the mean value.
#s.mean worked.
print(s.fillna(s.mean()))
print()

#.dropna removes np.NaN
print(s.dropna())

   A    B
0  0  1.0
1  0  2.0
2  0  3.0
3  0  NaN
4  0  5.0
5  0  6.0
6  0  NaN

      B                                        
  count mean       std  min  25%  50%  75%  max
A                                              
0   5.0  3.4  2.073644  1.0  2.0  3.0  5.0  6.0

   A    B
0  0  1.0
1  0  2.0
2  0  3.0
3  0  3.4
4  0  5.0
5  0  6.0
6  0  3.4

   A    B
0  0  1.0
1  0  2.0
2  0  3.0
4  0  5.0
5  0  6.0


## Imputing missing data

In [13]:

#Modified code gpt.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

#Create a sample dataset.
s = [[1, 2, 3, np.NaN, 5, 6, None]]

#We define what to look for. axis=0 to impute along columns. axis=1 to impute along rows. 
#stragety can be mean, median, most_frequent.
#imp = SimpleImputer(missing_values='NaN',
              #strategy='mean', axis=0)
imp = SimpleImputer(strategy='mean')

#statistics for imputer. It has seven inputs == len(s)
imp.fit([[1, 2, 3, 4, 5, 6, 7]])

#.tolist converts output to a list.
#.transform() on s to fill in the missing values.
x = pd.Series(imp.transform(s).tolist()[0])

print(x)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
dtype: float64


# Slicing and Dicing: Filtering and Selecting Data

## Slicing rows

In [14]:

#The following example builds a 3-D array. 
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
             [[11,12,13], [14,15,16], [17,18,19],],
             [[21,22,23], [24,25,26], [27,28,29]]])

#It then slices row 1 of that array to produce the following output:
x[1]

array([[11, 12, 13],
       [14, 15, 16],
       [17, 18, 19]])

## Slicing columns

In [15]:
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
             [[11,12,13], [14,15,16], [17,18,19],],
             [[21,22,23], [24,25,26], [27,28,29]]])

#The indexing now occurs at two levels. The first index refers to row. Using the colon(:) means to use all the rows. It prints the whole column 1.
x[:,1]

array([[ 4,  5,  6],
       [14, 15, 16],
       [24, 25, 26]])

## Dicing

In [16]:
x = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9],],
             [[11,12,13], [14,15,16], [17,18,19],],
             [[21,22,23], [24,25,26], [27,28,29]]])

#row 1, column 1.
print(x[1,1])

#whole row, column 1, z = 1.
print(x[:,1,1])

#row 1, z = 1
print(x[1,:,1])
print()

#rows 1 and 2, columns 1 and 2.
print(x[1:3, 1:3])

[14 15 16]
[ 5 15 25]
[12 15 18]

[[[14 15 16]
  [17 18 19]]

 [[24 25 26]
  [27 28 29]]]


# Concatenating and Transforming

## Adding new cases and variables

In [9]:
import pandas as pd

df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

df1 = pd.DataFrame({'A': [4],
                    'B': [4],
                    'C': [4]})

#The easiest way to add more data is to rely on the append() method.
#append() has been removed in panda 2.0+, use .concat instead.
df = pd.concat([df, df1])

#Use the reset_index() method to create a new index to make accessing cases earlier.
df = df.reset_index(drop=True)
print(df)

#Alternatively, you can create the new case directly.
df.loc[df.last_valid_index() + 1] = [5, 5, 5]
print()
print(df)

#Sometimes, you need add a new variable to the DataFrame. In this case, you rely on join() to perform the task.
#
df2 = pd.DataFrame({'D': [1, 2, 3, 4, 5]})

df = pd.DataFrame.join(df, df2)
print()
print(df)

   A  B  C
0  2  1  5
1  3  2  3
2  1  3  4
3  4  4  4

   A  B  C
0  2  1  5
1  3  2  3
2  1  3  4
3  4  4  4
4  5  5  5

   A  B  C  D
0  2  1  5  1
1  3  2  3  2
2  1  3  4  3
3  4  4  4  4
4  5  5  5  5


## Removing data

In [None]:
import pandas as pd

df = pd.DataFrame({'A': [2,3,1],
                   'B': [1,2,3],
                   'C': [5,3,4]})

#To remove just one case.
df = df.drop(df.index[[1]])
print(df)

#This example shows how to remove a column using a column name. In both cases, you must specify an axis as part of
#the removal process. Note the correction by gpt: axis = 1.
df = df.drop('B', axis=1)
print()
print(df)

## Sorting and shuffling

In [12]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'A': [2,1,2,3,3,5,4],
                   'B': ['Fatih',2,3,5,4,2,5],
                   'C': [5,3,4,1,1,2,3]})

#To sort the data, you use .sort_values, ascending/descending.
#A or B will be sorted, the rest keeps up.
df = df.sort_values(by=['A', 'B'], ascending=[True, True])


#Make sure to always call reset_index() when you' re done. 
df = df.reset_index(drop=True)
print(df)

#First, acquire the current index by:
index = df.index.tolist()

#Now, create a new order for index.
np.random.shuffle(index)

#Apply the new order to df using loc[].
df = df.loc[df.index[index]]

#As always, you call reset_index() to finalize the new order.
df = df.reset_index(drop=True)
print()
print(df)

   A      B  C
0  1      2  3
1  2      3  4
2  2  Fatih  5
3  3      4  1
4  3      5  1
5  4      5  3
6  5      2  2

   A      B  C
0  3      4  1
1  5      2  2
2  2      3  4
3  4      5  3
4  1      2  3
5  3      5  1
6  2  Fatih  5


In [18]:
#Manuel
import pandas as pd
import numpy as np

df = pd.DataFrame({'Adi': ['Ayse','Fatma','Ali','Veli'],
                   'Soyadi': ['Kaya','Celik','Vodafone','Deli'],
                   'TCKN': [5247727,3272772, 2727277, 2272727]})

#To sort the data, you use .sort_values, ascending/descending.
#A or B will be sorted, the rest keeps up.
df = df.sort_values(by=['TCKN'], ascending=[True])

df = df.reset_index(drop=True)
print(df)

     Adi    Soyadi     TCKN
0   Veli      Deli  2272727
1    Ali  Vodafone  2727277
2  Fatma     Celik  3272772
3   Ayse      Kaya  5247727


# Aggregating Data at Any Level

In [8]:

#Modified by gpt: To address these warnings and future-proof your code, you can pass the strings "sum", "mean", and 
#"var" directly to the transform function.
import pandas as pd
import numpy as np

df = pd.DataFrame({'Map': [0,0,0,1,1,2,2],
                   'Values': [1,2,3,5,4,2,5]})

#To perform the aggregation, you must first call groupby() to map the group values.
#You then index into Values and rely on the transform() to create the aggregated data using one of algorithms found in NumPy.
df['S'] = df.groupby('Map')['Values'].transform('sum')
df['M'] = df.groupby('Map')['Values'].transform('mean')
df['V'] = df.groupby('Map')['Values'].transform('var')


print(df)

   Map  Values  S    M    V
0    0       1  6  2.0  1.0
1    0       2  6  2.0  1.0
2    0       3  6  2.0  1.0
3    1       5  9  4.5  0.5
4    1       4  9  4.5  0.5
5    2       2  7  3.5  4.5
6    2       5  7  3.5  4.5
