In [142]:
import pandas as pd

# 1. Load the file "total_data_merged.xlsx" and create a dataframe with it
df = pd.read_excel('total_data_merged.xlsx', engine='openpyxl',sheet_name = 'data323_comma')

# 2. Read the FIRST column and remove all string 'R' present in the cells
df.iloc[:, 0] = df.iloc[:, 0].str.replace('R', '', regex=False)

# 3. For each different ODL number read in column 5, assign a test ID
# Initialize the starting ID
starting_id = 366

# Create a dictionary to store the mapping of ODL numbers to test IDs
odl_to_test_id = {}

# Loop through each row in the dataframe
for index, row in df.iterrows():
    odl_number = row.iloc[4]
    if odl_number not in odl_to_test_id:
        odl_to_test_id[odl_number] = f'L{starting_id:06d}'
        starting_id += 1
    df.at[index, 'TEST.ID'] = odl_to_test_id[odl_number]

# 4. In column 8 named "GAS REFRIGERANTE", remove the characters " _PA.txt" from the cells
df['GAS REFRIGERANTE'] = df['GAS REFRIGERANTE'].str.replace(' _PA.txt', '', regex=False)

# Save the updated dataframe to a new Excel file
#df.to_excel('updated_total_data_merged.xlsx', index=False, engine='openpyxl')

print("The file has been processed and saved as 'updated_total_data_merged.xlsx'.")

The file has been processed and saved as 'updated_total_data_merged.xlsx'.


In [143]:
df = df.sort_values(by='TEST.ID')


In [144]:
# Count the number of unique test IDs
num_test_ids = df['TEST.ID'].nunique()
print(f"The number of unique test IDs is: {num_test_ids}")

The number of unique test IDs is: 195


In [145]:
# Calculate the total amount of time of the tests by summing values in the "Duration" column

# Convert the "Duration" column to timedelta
df['Duration'] = df['Duration'].replace('Cannot be calculated', pd.NaT)
df['Duration'] = pd.to_timedelta(df['Duration'])
total_duration = df['Duration'].sum()

print(f"The total registered time of the tests is: {total_duration}")

The total registered time of the tests is: 16 days 16:04:28


In [146]:
df = df.iloc[1:].reset_index(drop=True)
# Read the string in the cells for every row in the column "DATA/ORA COLLAUDO"
# Count 12 characters and remove the rest
df['DATA/ORA COLLAUDO'] = df['DATA/ORA COLLAUDO'].str[:10]

# Convert the cells to date time format dd/mm/yyyy
df['DATA/ORA COLLAUDO'] = pd.to_datetime(df['DATA/ORA COLLAUDO'], format='%d/%m/%Y', errors='coerce')

# Drop rows with NaT values in 'DATA/ORA COLLAUDO' column
df = df.dropna(subset=['DATA/ORA COLLAUDO'])

# Create two different dataframes, df_2023 for rows that contain data from 2023 and df_2024 with data from 2024
df_2023 = df[df['DATA/ORA COLLAUDO'].dt.year == 2023]
df_2024 = df[df['DATA/ORA COLLAUDO'].dt.year == 2024]


In [147]:
# Count the number of unique test IDs
num_test_ids_2023 = df_2023['TEST.ID'].nunique()
num_test_ids_2024 = df_2024['TEST.ID'].nunique()
print(f"The number of unique test IDs in 2023 is: {num_test_ids_2023}")
print(f"The number of unique test IDs in 2024 is: {num_test_ids_2024}")

The number of unique test IDs in 2023 is: 114
The number of unique test IDs in 2024 is: 80


In [148]:
# Calculate the total amount of time of the tests by summing values in the "Duration" column
total_duration_2023 = df_2023['Duration'].sum()
total_duration_2024 = df_2024['Duration'].sum()
print(f"The total registered time in 2023  is: {total_duration_2023}")
print(f"The total registered time in 2024  is: {total_duration_2024}")

The total registered time in 2023  is: 8 days 22:45:36
The total registered time in 2024  is: 7 days 15:05:19


In [149]:
df_2023_60 = df_2023[df_2023['STAZIONE DI COLLAUDO'] == "60"]
df_2023_110 = df_2023[df_2023['STAZIONE DI COLLAUDO'] == "110"]

df_2024_60 = df_2024[df_2024['STAZIONE DI COLLAUDO'] == "60"]
df_2024_110 = df_2024[df_2024['STAZIONE DI COLLAUDO'] == "110"]

In [150]:
# Count the number of unique test IDs
num_test_60_2023 = df_2023_60['TEST.ID'].nunique()
print(f"The number of test for cell 60 in 2023: {num_test_60_2023}")

num_test_110_2023 = df_2023_110['TEST.ID'].nunique()
print(f"The number of test for cell 110 in 2023: {num_test_110_2023}")

num_test_60_2024 = df_2024_60['TEST.ID'].nunique()
print(f"The number of test for cell 60 in 2024: {num_test_60_2024}")

num_test_110_2024 = df_2024_110['TEST.ID'].nunique()
print(f"The number of test for cell 110 in 2024: {num_test_110_2024}")

The number of test for cell 60 in 2023: 50
The number of test for cell 110 in 2023: 70
The number of test for cell 60 in 2024: 31
The number of test for cell 110 in 2024: 56


In [151]:
# Count the number of unique test IDs
df_2023_60_water = df_2023_60[df_2023_60['FLUIDO'] == "ACQUA"]
num_test_60_2023_water = df_2023_60_water['TEST.ID'].nunique()
print(f"The number of test (WATER) for cell 60 in 2023: {num_test_60_2023_water}")

df_2023_60_glicol = df_2023_60[df_2023_60['FLUIDO'] == "GLICOLE"]
num_test_60_2023_glicol = df_2023_60_glicol['TEST.ID'].nunique()
print(f"The number of test (GLYCOL) for cell 60 in 2023: {num_test_60_2023_glicol}")

df_2023_60_oil = df_2023_60[df_2023_60['FLUIDO'] == "OLIO"]
num_test_60_2023_oil= df_2023_60_oil['TEST.ID'].nunique()
print(f"The number of test (OIL) for cell 60 in 2023: {num_test_60_2023_oil}")

df_2024_60_water = df_2024_60[df_2024_60['FLUIDO'] == "ACQUA"]
num_test_60_2024_water = df_2024_60_water['TEST.ID'].nunique()
print(f"The number of test (WATER) for cell 60 in 2024: {num_test_60_2024_water}")

df_2024_60_glicol = df_2024_60[df_2024_60['FLUIDO'] == "GLICOLE"]
num_test_60_2024_glicol = df_2024_60_glicol['TEST.ID'].nunique()
print(f"The number of test (GLYCOL) for cell 60 in 2024: {num_test_60_2024_glicol}")

df_2024_60_oil = df_2024_60[df_2024_60['FLUIDO'] == "OLIO"]
num_test_60_2024_oil= df_2024_60_oil['TEST.ID'].nunique()
print(f"The number of test (OIL) for cell 60 in 2024: {num_test_60_2024_oil}")



The number of test (WATER) for cell 60 in 2023: 43
The number of test (GLYCOL) for cell 60 in 2023: 4
The number of test (OIL) for cell 60 in 2023: 3
The number of test (WATER) for cell 60 in 2024: 24
The number of test (GLYCOL) for cell 60 in 2024: 3
The number of test (OIL) for cell 60 in 2024: 4


In [152]:
df_2023_110_water = df_2023_110[df_2023_110['FLUIDO'] == "ACQUA"]
num_test_110_2023_water = df_2023_110_water['TEST.ID'].nunique()
print(f"The number of test (WATER) for cell 110 in 2023: {num_test_110_2023_water}")

df_2023_110_glicol = df_2023_110[df_2023_110['FLUIDO'] == "GLICOLE"]
num_test_110_2023_glicol = df_2023_110_glicol['TEST.ID'].nunique()
print(f"The number of test (GLYCOL) for cell 110 in 2023: {num_test_110_2023_glicol}")

df_2023_110_oil = df_2023_110[df_2023_110['FLUIDO'] == "OLIO"]
num_test_110_2023_oil= df_2023_110_oil['TEST.ID'].nunique()
print(f"The number of test (OIL) for cell 110 in 2023: {num_test_110_2023_oil}")

df_2024_110_water = df_2024_110[df_2024_110['FLUIDO'] == "ACQUA"]
num_test_110_2024_water = df_2024_110_water['TEST.ID'].nunique()
print(f"The number of test (WATER) for cell 110 in 2024: {num_test_110_2024_water}")

df_2024_110_glicol = df_2024_110[df_2024_110['FLUIDO'] == "GLICOLE"]
num_test_110_2024_glicol = df_2024_110_glicol['TEST.ID'].nunique()
print(f"The number of test (GLYCOL) for cell 110 in 2024: {num_test_110_2024_glicol}")

df_2024_110_oil = df_2024_110[df_2024_110['FLUIDO'] == "OLIO"]
num_test_110_2024_oil= df_2024_110_oil['TEST.ID'].nunique()
print(f"The number of test (OIL) for cell 110 in 2024: {num_test_110_2024_oil}")

The number of test (WATER) for cell 110 in 2023: 48
The number of test (GLYCOL) for cell 110 in 2023: 11
The number of test (OIL) for cell 110 in 2023: 14
The number of test (WATER) for cell 110 in 2024: 45
The number of test (GLYCOL) for cell 110 in 2024: 3
The number of test (OIL) for cell 110 in 2024: 9


In [153]:
df_2023_Carlini = df_2023_60[df_2023_60['OPERATORE'].str.contains("CARLINI", na=False)]
num_test_60_2023_Carlini = df_2023_Carlini['TEST.ID'].nunique()
print(f"Carlini in 2023 cell 60 performed this amount of tests: {num_test_60_2023_Carlini}")

df_2023_Restani = df_2023_60[df_2023_60['OPERATORE'].str.contains("RESTANI", na=False)]
num_test_60_2023_Restani = df_2023_Restani['TEST.ID'].nunique()
print(f"Restani in 2023 cell 60 performed this amount of tests: {num_test_60_2023_Restani}")

Carlini in 2023 cell 60 performed this amount of tests: 19
Restani in 2023 cell 60 performed this amount of tests: 37


In [154]:
# Check if there are multiple operators for the same test
test_operator_counts = df.groupby('TEST.ID')['OPERATORE'].nunique()
tests_with_multiple_operators = test_operator_counts[test_operator_counts > 1]
table_multiple_operators = pd.DataFrame(tests_with_multiple_operators)

# Handle cases with multiple operators
for test_id in tests_with_multiple_operators.index:
    test_rows = df[df['TEST.ID'] == test_id]
    for index, row in test_rows.iterrows():
        if row['STAZIONE DI COLLAUDO'] == 60:
            df.at[index, 'OPERATORE'] = 'CARLINI MIRCO'
        else:
            df.at[index, 'OPERATORE'] = 'RESTANI MATTEO'

# Group by 'operatore' and count the number of unique 'TEST.ID' for each operator
operator_test_counts = df.groupby('OPERATORE')['TEST.ID'].nunique()

# Calculate the total number of tests
total_tests = df['TEST.ID'].nunique()

# Calculate the percentage of tests carried out by each operator
operator_test_percentages = (operator_test_counts / total_tests) * 100

# Combine the counts and percentages into a single DataFrame
operator_test_summary = pd.DataFrame({
    'Number of Tests': operator_test_counts,
    'Percentage of Total Tests': operator_test_percentages
})

print(operator_test_summary)

                   Number of Tests  Percentage of Total Tests
OPERATORE                                                    
1 CARLINI MIRCO                 53                  27.461140
1 GUERZONI GIULIO                1                   0.518135
1 RESTANI MATTEO               109                  56.476684
RESTANI MATTEO                  30                  15.544041


In [180]:
table_multiple_operators.head(1000)

Unnamed: 0_level_0,OPERATORE
TEST.ID,Unnamed: 1_level_1
L000367,2
L000372,2
L000373,2
L000376,2
L000381,2
L000382,2
L000384,2
L000389,2
L000396,2
L000397,2


In [172]:
# Read the values in the first column ['TEST.ID'] of dataframe table_multiple_operators
test_ids_multiple_operators = table_multiple_operators.index.values.tolist()
# Replace the value at column "OPERATORE" by value "MULTIPLE" in dataframe df when the same values are found in column "TEST.ID"
df.loc[df['TEST.ID'].isin(test_ids_multiple_operators), 'OPERATORE'] = 'MULTIPLE'


In [175]:
# Check if there are multiple operators for the same test
test_operator_counts2 = df.groupby('TEST.ID')['OPERATORE'].nunique()
tests_with_multiple_operators2 = test_operator_counts2[test_operator_counts2 > 1]
table_multiple_operators2 = pd.DataFrame(tests_with_multiple_operators2)

# Handle cases with multiple operators
for test_id in tests_with_multiple_operators2.index:
    test_rows = df[df['TEST.ID'] == test_id]
    for index, row in test_rows.iterrows():
        if row['STAZIONE DI COLLAUDO'] == 60:
            df.at[index, 'OPERATORE'] = 'CARLINI MIRCO'
        else:
            df.at[index, 'OPERATORE'] = 'RESTANI MATTEO'

# Group by 'operatore' and count the number of unique 'TEST.ID' for each operator
operator_test_counts = df.groupby('OPERATORE')['TEST.ID'].nunique()

# Calculate the total number of tests
total_tests = df['TEST.ID'].nunique()

# Calculate the percentage of tests carried out by each operator
operator_test_percentages = (operator_test_counts / total_tests) * 100

# Combine the counts and percentages into a single DataFrame
operator_test_summary = pd.DataFrame({
    'Number of Tests': operator_test_counts,
    'Percentage of Total Tests': operator_test_percentages
})

print(operator_test_summary)

                   Number of Tests  Percentage of Total Tests
OPERATORE                                                    
1 CARLINI MIRCO                 53                  27.461140
1 GUERZONI GIULIO                1                   0.518135
1 RESTANI MATTEO               109                  56.476684
MULTIPLE                        30                  15.544041
