In [67]:
import pdfplumber
import pandas as pd

# Build a single DataFrame from the table that continues across every page of
# the PDF. The first row of the first page's table supplies the column names.
pdf_file = "encashment_details.pdf"

# Open the PDF exactly once, inside a context manager, so the underlying file
# handle is always released (previously it was opened twice and the first
# handle was never closed).
with pdfplumber.open(pdf_file) as pdf:
    pages = pdf.pages
    if not pages:
        raise ValueError(f"{pdf_file} contains no pages")

    tbl = pages[0].extract_table()
    if not tbl:
        # Fail with a clear message instead of an opaque IndexError on iloc[0].
        raise ValueError(f"No table found on the first page of {pdf_file}")

    original_df = pd.DataFrame(tbl)

    # Use the first row as the header, then drop it from the data rows.
    original_df.columns = original_df.iloc[0]
    original_df = original_df[1:]

    # Collect the tables from the remaining pages; pages without a table are
    # skipped. Each page frame takes the header found on the first page.
    tables = []
    for pg in pages[1:]:
        tbl = pg.extract_table()
        if tbl:
            page_df = pd.DataFrame(tbl)
            page_df.columns = original_df.columns
            tables.append(page_df)

# Concatenate once (not inside the loop) and renumber the rows from 0, so the
# index is consistent whether the PDF has one page or many.
original_df = pd.concat([original_df, *tables], ignore_index=True)

print(original_df)


0     Date of\nEncashment               Name of the Political Party  \
0             12/Apr/2019  ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM   
1             12/Apr/2019  ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM   
2             12/Apr/2019  ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM   
3             12/Apr/2019  ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM   
4             12/Apr/2019  ALL INDIA ANNA DRAVIDA MUNNETRA KAZHAGAM   
...                   ...                                       ...   
20416         24/Jan/2024                            JANASENA PARTY   
20417         24/Jan/2024                            JANASENA PARTY   
20418         24/Jan/2024                            JANASENA PARTY   
20419         24/Jan/2024                            JANASENA PARTY   
20420         24/Jan/2024                            JANASENA PARTY   

0     Denomination  
0        10,00,000  
1        10,00,000  
2      1,00,00,000  
3        10,00,000  
4        10,00,000  
...            ...  


In [68]:
# Export the combined table to CSV; index=False leaves the row index out of the file.
original_df.to_csv(path_or_buf='encashment_details_output.csv', index=False)


In [151]:
import pandas as pd

# Read the exported CSV back into a DataFrame.
df = pd.read_csv(filepath_or_buffer='encashment_details_output.csv')

# Print the summary produced by DataFrame.describe() for the loaded data.
stats = df.describe()
print(stats)


       Date of\nEncashment Name of the Political Party Denomination
count                20421                       20421        20421
unique                 230                          27            5
top            18/Apr/2019        BHARTIYA JANTA PARTY  1,00,00,000
freq                   596                        8633        12207


In [133]:
# Show the column labels of the loaded DataFrame.
column_labels = df.columns
print(column_labels)

Index(['Date of\nEncashment', 'Name of the Political Party', 'Denomination'], dtype='object')


In [145]:
import pandas as pd

# One crore is 10 million (10**7); used to express the totals in crores.
RUPEES_PER_CRORE = 10**7

# Convert "Denomination" (comma-grouped text such as "1,00,00,000") to float.
# Casting to str first keeps this cell re-runnable: after the first run the
# column is already float, and calling `.str` on it directly would raise
# AttributeError.
df['Denomination'] = (
    df['Denomination']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .astype(float)
)

# Group by "Name of the Political Party" and sum "Denomination".
grouped_df = df.groupby('Name of the Political Party')['Denomination'].sum().reset_index()

# Sort the totals in descending order. The column is already float here, so no
# further type conversion is needed.
sorted_df = grouped_df.sort_values(by='Denomination', ascending=False)

# Express each total in crores.
sorted_df['Denomination in Crores'] = sorted_df['Denomination'] / RUPEES_PER_CRORE

# Display party names alongside their totals in crores.
print(sorted_df[['Name of the Political Party', 'Denomination in Crores']])



                       Name of the Political Party  Denomination in Crores
5                             BHARTIYA JANTA PARTY               6060.5111
3                     ALL INDIA TRINAMOOL CONGRESS               1609.5314
17         PRESIDENT, ALL INDIA CONGRESS COMMITTEE               1421.8655
4                           BHARAT RASHTRA SAMITHI               1214.7099
7                                  BIJU JANATA DAL                775.5000
8                          DMK PARTY IN PARLIAMENT                639.0000
26          YSR CONGRESS PARTY (YUVAJANA SRAMIKA R                337.0000
25                              TELUGU DESAM PARTY                218.8800
22                      SHIVSENA (POLITICAL PARTY)                158.3814
19                              RASTRIYA JANTA DAL                 72.5000
0                                  AAM AADMI PARTY                 65.4500
12                          JANATA DAL ( SECULAR )                 43.5000
24                       