In [2]:
#1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Note: matplotlib & seaborn removed due to environment limitations

In [3]:
#2. Load Dataset
data = pd.read_csv("data/austin_weather.csv")
print("Dataset Loaded Successfully!\n")
data.head()

Dataset Loaded Successfully!



Unnamed: 0,Date,TempHighF,TempAvgF,TempLowF,DewPointHighF,DewPointAvgF,DewPointLowF,HumidityHighPercent,HumidityAvgPercent,HumidityLowPercent,...,SeaLevelPressureAvgInches,SeaLevelPressureLowInches,VisibilityHighMiles,VisibilityAvgMiles,VisibilityLowMiles,WindHighMPH,WindAvgMPH,WindGustMPH,PrecipitationSumInches,Events
0,2013-12-21,74,60,45,67,49,43,93,75,57,...,29.68,29.59,10,7,2,20,4,31,0.46,"Rain , Thunderstorm"
1,2013-12-22,56,48,39,43,36,28,93,68,43,...,30.13,29.87,10,10,5,16,6,25,0,
2,2013-12-23,58,45,32,31,27,23,76,52,27,...,30.49,30.41,10,10,10,8,3,12,0,
3,2013-12-24,61,46,31,36,28,21,89,56,22,...,30.45,30.3,10,10,7,12,4,20,0,
4,2013-12-25,58,50,41,44,40,36,86,71,56,...,30.33,30.27,10,10,7,10,2,16,T,


In [4]:

#3. Data Exploration
print("Dataset Info:")
data.info()

print("\nDataset Description:")
data.describe()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Date                        1319 non-null   object
 1   TempHighF                   1319 non-null   int64 
 2   TempAvgF                    1319 non-null   int64 
 3   TempLowF                    1319 non-null   int64 
 4   DewPointHighF               1319 non-null   object
 5   DewPointAvgF                1319 non-null   object
 6   DewPointLowF                1319 non-null   object
 7   HumidityHighPercent         1319 non-null   object
 8   HumidityAvgPercent          1319 non-null   object
 9   HumidityLowPercent          1319 non-null   object
 10  SeaLevelPressureHighInches  1319 non-null   object
 11  SeaLevelPressureAvgInches   1319 non-null   object
 12  SeaLevelPressureLowInches   1319 non-null   object
 13  VisibilityHighMiles         1319 n

Unnamed: 0,TempHighF,TempAvgF,TempLowF
count,1319.0,1319.0,1319.0
mean,80.862775,70.642911,59.902957
std,14.766523,14.045904,14.190648
min,32.0,29.0,19.0
25%,72.0,62.0,49.0
50%,83.0,73.0,63.0
75%,92.0,83.0,73.0
max,107.0,93.0,81.0


In [5]:
#4. Data Cleaning
# Replace 'T' (Trace) and '-' with 0 or NaN
data.replace({'T': 0, '-': 0}, inplace=True)

# Convert columns to numeric where applicable
cols_to_convert = ['PrecipitationSumInches', 'TempAvgF', 'HumidityAvgPercent', 'WindAvgMPH']
for col in cols_to_convert:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop rows with any NaN values
data.dropna(inplace=True)

print("\nCleaned Dataset Info:")
data.info()



Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        1319 non-null   object 
 1   TempHighF                   1319 non-null   int64  
 2   TempAvgF                    1319 non-null   int64  
 3   TempLowF                    1319 non-null   int64  
 4   DewPointHighF               1319 non-null   object 
 5   DewPointAvgF                1319 non-null   object 
 6   DewPointLowF                1319 non-null   object 
 7   HumidityHighPercent         1319 non-null   object 
 8   HumidityAvgPercent          1319 non-null   int64  
 9   HumidityLowPercent          1319 non-null   object 
 10  SeaLevelPressureHighInches  1319 non-null   object 
 11  SeaLevelPressureAvgInches   1319 non-null   object 
 12  SeaLevelPressureLowInches   1319 non-null   object 
 13  Visibility

In [6]:
#5. Feature Selection
features = ['TempAvgF', 'HumidityAvgPercent', 'WindAvgMPH']
target = 'PrecipitationSumInches'

X = data[features]
y = data[target]


In [7]:
#6. Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
#7. Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

In [10]:
#8. Model Evaluation
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"\nModel Evaluation:")
print(f"R2 Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")



Model Evaluation:
R2 Score: 0.0980
Mean Squared Error: 0.1070


In [11]:
#9. Visualization (using pandas built-in + text summaries)
print("\nGenerating correlation matrix and basic scatter summaries...")

# Only numeric columns to avoid ValueError
correlation_matrix = data.select_dtypes(include=[np.number]).corr()
print("\nCorrelation Matrix:\n")
print(correlation_matrix)

# Scatter plot summaries
def scatter_summary(x, y):
    correlation = np.corrcoef(data[x], data[y])[0,1]
    print(f"Correlation between {x} and {y}: {correlation:.4f}")

scatter_summary('TempAvgF', 'PrecipitationSumInches')
scatter_summary('HumidityAvgPercent', 'PrecipitationSumInches')
scatter_summary('WindAvgMPH', 'PrecipitationSumInches')


Generating correlation matrix and basic scatter summaries...

Correlation Matrix:

                        TempHighF  TempAvgF  TempLowF  HumidityAvgPercent  \
TempHighF                1.000000  0.970655  0.881977           -0.116141   
TempAvgF                 0.970655  1.000000  0.968573            0.022763   
TempLowF                 0.881977  0.968573  1.000000            0.165002   
HumidityAvgPercent      -0.116141  0.022763  0.165002            1.000000   
WindAvgMPH              -0.009297  0.034267  0.076016           -0.000472   
PrecipitationSumInches  -0.069869 -0.020442  0.034315            0.341382   

                        WindAvgMPH  PrecipitationSumInches  
TempHighF                -0.009297               -0.069869  
TempAvgF                  0.034267               -0.020442  
TempLowF                  0.076016                0.034315  
HumidityAvgPercent       -0.000472                0.341382  
WindAvgMPH                1.000000                0.033660  
Precipitat

In [12]:

#10. Conclusion
print("\n\u2705 Key Insights:")
print("- Humidity shows a stronger correlation with precipitation.")
print("- Temperature and wind speed have weaker relationships with precipitation.")
print("- The Linear Regression model shows moderate predictive ability (check R² score).\n")

print("Project Completed!")



✅ Key Insights:
- Humidity shows a stronger correlation with precipitation.
- Temperature and wind speed have weaker relationships with precipitation.
- The Linear Regression model shows moderate predictive ability (check R² score).

Project Completed!
