In [1]:
import pandas as pd
train_data=pd.read_csv("train.csv")
train_data.head(1)

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropFieldConfiguration,CropSpeciesVariety,CultivatedAndWildArea,CultivatedAreaSqft1,DistrictId,FarmClassification,FarmEquipmentArea,...,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,UndergroundStorageSqft,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,,3.0,,1136.0,1.0,,,...,8636.716,456255.6,118.0,,,2018.0,2.0,2.0,,high


In [2]:
# Count missing values for each column
missing_values = train_data.isnull().sum()

# Create a DataFrame to store the count of missing values
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'MissingCount': missing_values.values
})

# Add a column to show the percentage of missing values
missing_df['MissingPercentage'] = (missing_df['MissingCount'] / len(train_data)) * 100

# Sort the DataFrame by the number of missing values in descending order
missing_df.sort_values(by='MissingCount', ascending=False, inplace=True)

# Reset index for readability
missing_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(missing_df)

                          Column  MissingCount  MissingPercentage
0             FarmClassification        112552          99.984898
1       PerimeterGuardPlantsArea        112525          99.960913
2         UndergroundStorageSqft        112512          99.949364
3                 FieldZoneLevel        112512          99.949364
4             HarvestStorageSqft        112457          99.900505
5                  HasGreenHouse        112305          99.765477
6         CropFieldConfiguration        112274          99.737939
7          FieldConstructionType        112239          99.706846
8          CultivatedAndWildArea        112027          99.518518
9                FieldShadeCover        111701          99.228917
10                 ReservoirType        111477          99.029928
11            TotalReservoirSize        111332          98.901118
12           ReservoirWithFilter        111032          98.634615
13                HasPestControl        109940          97.664544
14        

In [3]:
# Drop columns with missing percentage greater than 60%
columns_to_drop = missing_df[missing_df['MissingPercentage'] > 60]['Column'].tolist()

# Drop the identified columns from the DataFrame
train_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the updated DataFrame shape after dropping columns
print(f"Updated shape of the DataFrame: {train_data.shape}")

Updated shape of the DataFrame: (112569, 28)


In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Display the updated DataFrame to confirm the change
print(train_data.head(1))

       AgriculturalPostalZone  AgricultureZoningCode  CropSpeciesVariety  \
UID                                                                        
12998                  291674                    0.0                 3.0   

       CultivatedAreaSqft1  DistrictId  FarmingUnitCount  \
UID                                                        
12998               1136.0         1.0               1.0   

       FieldEstablishedYear  FieldSizeSqft  HarvestProcessingType  \
UID                                                                 
12998                1926.0         6265.0                    2.0   

       LandUsageType  ...  TaxAgrarianValue  TaxLandValue  \
UID                   ...                                   
12998            1.0  ...           81652.8      374602.8   

       TotalCultivatedAreaSqft  TotalTaxAssessed  TotalValue  TownId  \
UID                                                                    
12998                   1136.0          8636.716    45

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Display the first few rows of the labels to verify the mapping
print(train_labels.head())

UID
12998     2
20860     1
75725     1
106521    0
99467     1
Name: Target, dtype: int64


In [6]:
train_data = train_data.drop(columns=['TownId','Target','DistrictId'])

In [7]:
def fill_missing_values(df):
    # Define the columns based on their type
    categorical_columns = [
        'HarvestProcessingType', 'SoilFertilityType', 'AgricultureZoningCode',
        'ValuationYear', 'NationalRegionCode', 'StorageAndFacilityCount', 'RawLocationId',
        'LandUsageType', 'CropSpeciesVariety', 'AgriculturalPostalZone'
    ]
    
    median_columns = [
        'FarmingUnitCount', 'FieldSizeSqft', 'CultivatedAreaSqft1', 'MainIrrigationSystemCount',
        'FieldEstablishedYear', 'TotalTaxAssessed', 'TaxLandValue', 'TotalCultivatedAreaSqft',
        'WaterAccessPoints', 'TaxAgrarianValue', 'TotalValue'
    ]
    
    mean_columns = [
        'WaterAccessPointsCalc', 'Longitude', 'Latitude'
    ]
    
    # Convert categorical columns to 'object' type if necessary
    for column in categorical_columns:
        if column in df.columns:
            df[column] = df[column].astype('object')

    # Fill missing values for categorical columns using mode
    for column in categorical_columns:
        if column in df.columns:
            if df[column].isnull().sum() > 0:
                try:
                    mode_value = df[column].mode(dropna=True)[0] if not df[column].mode().empty else None
                    if mode_value is not None:
                        df[column].fillna(mode_value, inplace=True)
                    else:
                        print(f"Warning: Could not find a mode for column {column}")
                except Exception as e:
                    print(f"Error while filling mode for column {column}: {e}")
    
    # Fill missing values for numerical columns using median
    for column in median_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                median_value = df[column].median()
                df[column].fillna(median_value, inplace=True)
    
    # Fill missing values for numerical columns using mean
    for column in mean_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                mean_value = df[column].mean()
                df[column].fillna(mean_value, inplace=True)
    
    return df

# Fill missing values in the training data
train_data = fill_missing_values(train_data)

# Check if there are still missing values
missing_values = train_data.isnull().sum()
print("Missing values after filling:\n", missing_values[missing_values > 0])


Missing values after filling:
 Series([], dtype: int64)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [8]:
print(train_data.dtypes)

AgriculturalPostalZone        object
AgricultureZoningCode        float64
CropSpeciesVariety           float64
CultivatedAreaSqft1          float64
FarmingUnitCount             float64
FieldEstablishedYear         float64
FieldSizeSqft                float64
HarvestProcessingType        float64
LandUsageType                float64
Latitude                     float64
Longitude                    float64
MainIrrigationSystemCount    float64
NationalRegionCode           float64
RawLocationId                float64
SoilFertilityType            float64
StorageAndFacilityCount      float64
TaxAgrarianValue             float64
TaxLandValue                 float64
TotalCultivatedAreaSqft      float64
TotalTaxAssessed             float64
TotalValue                   float64
ValuationYear                float64
WaterAccessPoints            float64
WaterAccessPointsCalc        float64
dtype: object


In [9]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns


# Apply t-SNE to reduce the features to 2 dimensions
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=300)
tsne_results = tsne.fit_transform(train_data)

# Create a DataFrame with t-SNE results
tsne_df = pd.DataFrame(tsne_results, columns=['Component 1', 'Component 2'])
tsne_df['Target'] = train_labels

# Plotting the t-SNE results
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x='Component 1', y='Component 2',
    hue='Target',
    palette={0: 'blue', 1: 'orange', 2: 'green'},
    data=tsne_df,
    legend='full',
    alpha=0.7
)

plt.title('t-SNE Visualization of the Dataset')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend(title='Target', labels=['Low', 'Medium', 'High'])
plt.grid(True)
plt.show()


TypeError: TSNE.__init__() got an unexpected keyword argument 'max_iter'