In [2]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [3]:
# Step 1: Load the dataset
data = pd.read_csv('USArrests.csv')
data.head()


Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [6]:
# Step 2: Data preprocessing (if required)
# Handling Missing Values (if any)
# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)



Missing Values:
 Unnamed: 0    0
Murder        0
Assault       0
UrbanPop      0
Rape          0
dtype: int64


In [8]:
# Step 3: Feature Scaling
# Standardize the numerical features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data.iloc[:, 1:])  # Exclude the first column (state names)
print(scaled_data)

[[ 1.25517927  0.79078716 -0.52619514 -0.00345116]
 [ 0.51301858  1.11805959 -1.22406668  2.50942392]
 [ 0.07236067  1.49381682  1.00912225  1.05346626]
 [ 0.23470832  0.23321191 -1.08449238 -0.18679398]
 [ 0.28109336  1.2756352   1.77678094  2.08881393]
 [ 0.02597562  0.40290872  0.86954794  1.88390137]
 [-1.04088037 -0.73648418  0.79976079 -1.09272319]
 [-0.43787481  0.81502956  0.45082502 -0.58583422]
 [ 1.76541475  1.99078607  1.00912225  1.1505301 ]
 [ 2.22926518  0.48775713 -0.38662083  0.49265293]
 [-0.57702994 -1.51224105  1.21848371 -0.11129987]
 [-1.20322802 -0.61527217 -0.80534376 -0.75839217]
 [ 0.60578867  0.94836277  1.21848371  0.29852525]
 [-0.13637203 -0.70012057 -0.03768506 -0.0250209 ]
 [-1.29599811 -1.39102904 -0.5959823  -1.07115345]
 [-0.41468229 -0.67587817  0.03210209 -0.34856705]
 [ 0.44344101 -0.74860538 -0.94491807 -0.53190987]
 [ 1.76541475  0.94836277  0.03210209  0.10439756]
 [-1.31919063 -1.06375661 -1.01470522 -1.44862395]
 [ 0.81452136  1.56654403  0.10

In [9]:
# Step 4: Perform PCA
pca = PCA(n_components=2)  # Keep 2 principal components
principal_components = pca.fit_transform(scaled_data)
print(principal_components)


[[ 0.98556588  1.13339238]
 [ 1.95013775  1.07321326]
 [ 1.76316354 -0.74595678]
 [-0.14142029  1.11979678]
 [ 2.52398013 -1.54293399]
 [ 1.51456286 -0.98755509]
 [-1.35864746 -1.08892789]
 [ 0.04770931 -0.32535892]
 [ 3.01304227  0.03922851]
 [ 1.63928304  1.2789424 ]
 [-0.91265715 -1.57046001]
 [-1.63979985  0.21097292]
 [ 1.37891072 -0.68184119]
 [-0.50546136 -0.15156254]
 [-2.25364607 -0.10405407]
 [-0.79688112 -0.2701647 ]
 [-0.75085907  0.95844029]
 [ 1.56481798  0.87105466]
 [-2.39682949  0.37639158]
 [ 1.76336939  0.42765519]
 [-0.48616629 -1.4744965 ]
 [ 2.10844115 -0.15539682]
 [-1.69268181 -0.63226125]
 [ 0.99649446  2.39379599]
 [ 0.69678733 -0.26335479]
 [-1.18545191  0.53687437]
 [-1.26563654 -0.19395373]
 [ 2.87439454 -0.7756002 ]
 [-2.38391541 -0.01808229]
 [ 0.18156611 -1.44950571]
 [ 1.98002375  0.14284878]
 [ 1.68257738 -0.82318414]
 [ 1.12337861  2.22800338]
 [-2.99222562  0.59911882]
 [-0.22596542 -0.74223824]
 [-0.31178286 -0.28785421]
 [ 0.05912208 -0.54141145]
 

In [11]:
# Step 5: Convert PCA results back to a DataFrame if needed
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
final_df = pd.concat([data.iloc[:, 0], principal_df], axis=1)

In [12]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,PC1,PC2
0,Alabama,0.985566,1.133392
1,Alaska,1.950138,1.073213
2,Arizona,1.763164,-0.745957
3,Arkansas,-0.14142,1.119797
4,California,2.52398,-1.542934


In [13]:
# Step 6: Calculate variance explained
variance_explained = pca.explained_variance_ratio_
variance_pc1 = variance_explained[0]  # Variance explained by PC1
variance_pc2 = variance_explained[1]  # Variance explained by PC2

print("Variance explained by PC1: {:.2f}%".format(variance_pc1 * 100))
print("Variance explained by PC2: {:.2f}%".format(variance_pc2 * 100))

Variance explained by PC1: 62.01%
Variance explained by PC2: 24.74%


In [14]:
# Step 7: Convert PCA results back to a DataFrame if needed
principal_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
final_df = pd.concat([data.iloc[:, 0], principal_df], axis=1)

In [16]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,PC1,PC2
0,Alabama,0.985566,1.133392
1,Alaska,1.950138,1.073213
2,Arizona,1.763164,-0.745957
3,Arkansas,-0.14142,1.119797
4,California,2.52398,-1.542934


In [17]:
# Print the final DataFrame
print("\nFinal DataFrame with PCA results:\n", final_df.head())


Final DataFrame with PCA results:
    Unnamed: 0       PC1       PC2
0     Alabama  0.985566  1.133392
1      Alaska  1.950138  1.073213
2     Arizona  1.763164 -0.745957
3    Arkansas -0.141420  1.119797
4  California  2.523980 -1.542934
