In [None]:
import numpy as np

# Load the 2nd, 4th, and 6th columns of the CSV file (usecols is 0-indexed, hence 1, 3, 5).
# np.genfromtxt is used instead of np.loadtxt because later cells filter out NaN rows:
# genfromtxt turns empty or unparseable fields into NaN, whereas loadtxt raises a
# ValueError on such a field and the notebook would stop at this cell.
data = np.genfromtxt('your_file.csv', delimiter=',', skip_header=1, usecols=(1, 3, 5))

# Keep the array two-dimensional (rows x 3 columns) even when the file holds a single
# data row, so the row-wise loop and the axis=1 reductions in later cells still work.
data = data.reshape(-1, 3)

print(data)

In [None]:
# Process the data to extract meaningful information
# Each row contains: [number of participants, average age of acquisition, frequency count]
for row in data:
    avg_age = row[1]
    # int() raises ValueError on NaN and OverflowError on +/-inf. This cell runs before
    # the NaN rows are filtered out, so only convert the count columns when both are
    # finite; otherwise show the raw values (e.g. "nan") instead of crashing the loop.
    if np.isfinite(row[0]) and np.isfinite(row[2]):
        participants = int(row[0])
        frequency = int(row[2])
    else:
        participants = row[0]
        frequency = row[2]
    print(f"Participants: {participants}, Average Age of Acquisition: {avg_age}, Frequency: {frequency}")

In [None]:
# Structural overview of the loaded array: dimensions first, then element type.
print("Shape of the array:", data.shape)
print("Data type of the array:", data.dtype)

# Column-wise descriptive statistics (axis=0 reduces over the rows).
# Each entry pairs the printed label with the NumPy reduction that produces the value.
column_stats = (
    ("Minimum value in each column:", np.min),
    ("Maximum value in each column:", np.max),
    ("Mean value in each column:", np.mean),
    ("Standard deviation in each column:", np.std),
)
for label, reducer in column_stats:
    print(label, reducer(data, axis=0))

In [None]:
# Keep only the rows in which every column holds a real number.
# A row is complete when none of its entries is NaN.
complete_rows = np.all(~np.isnan(data), axis=1)
# Boolean-mask indexing returns a new array, so `data` itself is left untouched.
filtered_data = data[complete_rows]

print("Filtered data (rows without NaN values):")
print(filtered_data)

In [None]:
# Column-wise summary statistics of the NaN-free rows (axis=0 reduces over the rows).
min_values = filtered_data.min(axis=0)
max_values = filtered_data.max(axis=0)
mean_values = filtered_data.mean(axis=0)
std_dev_values = filtered_data.std(axis=0)

# Report the statistics
print("Summary Statistics for Filtered Data:")
print(f"Minimum values: {min_values}")
print(f"Maximum values: {max_values}")
print(f"Mean values: {mean_values}")
print(f"Standard deviation: {std_dev_values}")

# Flag columns containing at least one value further than two standard deviations
# from that column's mean (an example threshold, not a formal outlier test).
threshold = 2 * std_dev_values
lower_bound = mean_values - threshold
upper_bound = mean_values + threshold
outside_band = (filtered_data < lower_bound) | (filtered_data > upper_bound)
anomalies = outside_band.any(axis=0)

print("\nPotential anomalies detected in columns:")
# Columns are reported with 1-based numbering.
for column_number, is_anomalous in enumerate(anomalies, start=1):
    if not is_anomalous:
        continue
    print(f"Column {column_number} has potential anomalies.")

In [None]:
# Turn the raw frequencies in the last column into relative word frequencies by
# dividing each one by the column total.
frequency_column = filtered_data[:, -1]  # basic slicing gives a view into filtered_data
total_frequency = frequency_column.sum()
# In-place division through the view: the last column of filtered_data is overwritten,
# so it holds relative (not raw) frequencies from this point on.
frequency_column /= total_frequency

print("Normalized data (last column as relative frequencies):")
print(filtered_data)

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Plot a smooth density estimate of the relative word frequencies
# (the last column of filtered_data, which the normalization cell overwrote in place).
fig, ax = plt.subplots()
sns.kdeplot(filtered_data[:, -1], fill=True, ax=ax)
ax.set_title("Density Estimate of Relative Word Frequencies")
ax.set_xlabel("Relative Word Frequency")
ax.set_ylabel("Density")
plt.show()

# Plot the logarithms of the raw frequencies.
# The last column of filtered_data no longer holds raw counts at this point, so the raw
# values are recovered from the untouched `data` array using the same NaN-row filter.
complete_rows = ~np.isnan(data).any(axis=1)
raw_frequencies = data[complete_rows, -1]

# np.log(0) is -inf (and the log of a negative number is NaN), which a density
# estimate cannot handle, so only strictly positive counts are log-transformed.
positive_counts = raw_frequencies > 0
excluded = int(np.count_nonzero(~positive_counts))
if excluded:
    print(f"Excluded {excluded} non-positive raw frequencies from the logarithmic plot.")
log_frequencies = np.log(raw_frequencies[positive_counts])

fig, ax = plt.subplots()
sns.kdeplot(log_frequencies, fill=True, ax=ax)
ax.set_title("Density Estimate of Logarithms of Raw Frequencies")
ax.set_xlabel("Logarithm of Raw Frequency")
ax.set_ylabel("Density")
plt.show()

# Observations
print("Notice if the logarithmic plot shows unusual clustering or gaps, which might indicate issues in the data.")