# Analysis for China wage vs. education data

#### Since R cannot be ran on normal VS code, I had to install visual studio code insider for it!

## Datasets used

From Dropbox: China-Education-Wages -> Data -> CFPS Data 2010-2016

Downloaded:
- 2010 Egnlish -> ecfps2010adult_112014.dta     
  - Renamed 2010adult.dta
- 2012 Egnlish -> ecfps2012adultcombine…015.dta     
  - Renamed 2012adult.dta
- 2014 English -> ecfps2014adult_170630.dta     
  - Renamed 2014adult.dta
- Mincer16 -> Mincer16.csv 

All 4 datasets were put within one folder named $CFPSdata$ in the same layer as this jupyter notebook.

In [1]:
library(knitr)
library(xtable)
library(broom)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(stargazer)
library(lubridate)
library(haven)
library(ineq)
library(PerformanceAnalytics)
library(gglorenz)


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mforcats[39m 0.5.0
[32mv[39m [34mreadr  [39m 1.4.0     

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Please cite as: 


 Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

 R package version 5.2.2. https://CRAN.R-project.org/package=stargazer 



Att

## Manipulating data

### Read Data

In [39]:
data10 <- read_dta("./CFPSdata/2010adult.dta")
data10 <- filter(data10, !qc1 < 0)
data12 <- read_dta("./CFPSdata/2012adult.dta")
data12 <- filter(data12, !sch2012 < 0)
data14 <- read_dta("./CFPSdata/2014adult.dta")
data14 <- filter(data14, !pw1r < 0)
data16 <- read.csv("./CFPSdata/Mincer16.csv")

### Adding education and year dummies to the data, then combining them into one dataframe

In [9]:
pinc <- function(df) {
    categorize <- 
        df %>%
        group_by(rinc) %>%
        tally()
    categorize = categorize[order(categorize$rinc), ]
    key <- categorize$rinc
    pinc <- c()
    sum_so_far <- 0
    total <- nrow(df)
    categorize$pinc <- 0
    for (row in seq_len(nrow(categorize))) {
      sum_so_far <- sum_so_far + categorize[row, 'n']
      categorize[row, 'pinc'] <- (sum_so_far - (0.5 * categorize[row, 'n'])) / total
    }
    ordered_combined <- df[order(df$rinc), ]
    ordered_combined$pinc <- 0
    current_rinc <- 0
    current_pinc <- 0
    current_row <- 0
    for (row in seq_len(nrow(ordered_combined))) {
      rinc <- ordered_combined[row, 'rinc']
      if (rinc != current_rinc) {
        current_row <- current_row + 1
        current_rinc <- rinc
        current_pinc <- categorize[current_row, 'pinc']
      }
      ordered_combined[row, 'pinc'] <- current_pinc
    }
    ordered_combined$lnpinc <- log(ordered_combined$pinc)
    return(ordered_combined)
}

In [11]:
mincer10 = data.frame(
  income = data10$income, 
  age = data10$qa1age,
  gender = data10$gender,
  urban = data10$urban,
  prov = data10$provcd,
  ethnic = data10$qa5code,
  married = 0,
  party = 0,
  postsecondary = 0,
  seniorsecondary = 0, 
  juniorsecondary = 0,
  primary = 0, 
  illiterate = 0,
  y10 = 1,
  y12 = 0,
  y14 = 0,
  y16 = 0,
  Shanghai = 0,
  Liaoning = 0,
  Guangdong = 0,
  Gansu = 0,
  Others = 0)

for (row in 1:nrow(data10)) {
  marriage = data10[row, "qe1"]
  if (marriage == 2) {
    mincer10[row, "married"] <- 1
  }
  party = data10[row, "qa7_s_1"]
  if (party == 1) {
    mincer10[row, "party"] <- 1
  }
  edu = data10[row, "qc1"]
  if (edu == 1) {
    mincer10[row, "illiterate"] <- 1
  } else if (edu == 2) {
    mincer10[row, "primary"] <- 1
  } else if (edu == 3) {
    mincer10[row, "juniorsecondary"] <- 1
  } else if (edu == 4) {
    mincer10[row, "seniorsecondary"] <- 1
  } else if (edu > 4) {
    mincer10[row, "postsecondary"] <- 1
  }
  prov = mincer10[row, "prov"]
  if (prov == 21) {
    mincer10[row, "Liaoning"] <- 1
  } else if (prov == 31) {
    mincer10[row, "Shanghai"] <- 1
  } else if (prov == 44) {
    mincer10[row, "Guangdong"] <- 1
  } else if (prov == 62) {
    mincer10[row, "Gansu"] <- 1
  } else {
    mincer10[row, "Others"] <- 1
  }
}
mincer10 = filter(mincer10, !is.na(income) & income >= 0)

mincer10_urban = filter(mincer10, urban == 1)
mincer10_rural = filter(mincer10, urban == 0)
mincer10_female_urban = filter(mincer10_urban, gender == 1)
mincer10_female_rural = filter(mincer10_rural, gender == 1)
mincer10_male_urban = filter(mincer10_urban, gender == 0)
mincer10_male_rural = filter(mincer10_rural, gender == 0)

mincer10$rinc = mincer10$income / mean(mincer10$income)

mincer10_urban$rinc = mincer10_urban$income / mean(mincer10_urban$income)
mincer10_rural$rinc = mincer10_rural$income / mean(mincer10_rural$income)
mincer10_female_urban$rinc = mincer10_female_urban$income / mean(mincer10_female_urban$income)
mincer10_female_rural$rinc = mincer10_female_rural$income / mean(mincer10_female_rural$income)
mincer10_male_urban$rinc = mincer10_male_urban$income / mean(mincer10_male_urban$income)
mincer10_male_rural$rinc = mincer10_male_rural$income / mean(mincer10_male_rural$income)

In [12]:
mincer10 <- pinc(mincer10)

mincer10_urban <- pinc(mincer10_urban)

mincer10_rural <- pinc(mincer10_rural)

mincer10_female_urban <- pinc(mincer10_female_urban)

mincer10_female_rural <- pinc(mincer10_female_rural)

mincer10_male_urban <- pinc(mincer10_male_urban)

mincer10_male_rural <- pinc(mincer10_male_rural)

In [16]:
mincer12 <- data.frame(
  income = data12$income,
  age = data12$cfps2012_age,
  gender = data12$cfps2012_gender,
  urban = data12$urban12,
  prov = data12$provcd,
  ethnic = data12$qa701code,
  married = 0,
  party = data12$sn401,
  postsecondary = 0,
  seniorsecondary = 0,
  juniorsecondary = 0,
  primary = 0,
  illiterate = 0,
  y10 = 0,
  y12 = 1,
  y14 = 0,
  y16 = 0,
  Shanghai = 0,
  Liaoning = 0,
  Guangdong = 0,
  Gansu = 0,
  Others = 0)

for (row in seq_len(nrow(data12))) {
  marriage <- data12[row, "qe104"]
  if (marriage == 2) {
    mincer12[row, "married"] <- 1
  }
  edu <- data12[row, "sch2012"]
  if (edu == 1) {
    mincer12[row, "illiterate"] <- 1
  } else if (edu == 2) {
    mincer12[row, "primary"] <- 1
  } else if (edu == 3) {
    mincer12[row, "juniorsecondary"] <- 1
  } else if (edu == 4) {
    mincer12[row, "seniorsecondary"] <- 1
  } else if (edu > 4) {
    mincer12[row, "postsecondary"] <- 1
  }
  prov = mincer12[row, "prov"]
  if (prov == 21) {
    mincer12[row, "Liaoning"] <- 1
  } else if (prov == 31) {
    mincer12[row, "Shanghai"] <- 1
  } else if (prov == 44) {
    mincer12[row, "Guangdong"] <- 1
  } else if (prov == 62) {
    mincer12[row, "Gansu"] <- 1
  } else {
    mincer12[row, "Others"] <- 1
  }
}
mincer12 <- filter(mincer12, !is.na(income))

mincer12_urban <- filter(mincer12, urban == 1)
mincer12_rural <- filter(mincer12, urban == 0)
mincer12_female_urban <- filter(mincer12_urban, gender == 1)
mincer12_female_rural <- filter(mincer12_rural, gender == 1)
mincer12_male_urban <- filter(mincer12_urban, gender == 0)
mincer12_male_rural <- filter(mincer12_rural, gender == 0)

mincer12$rinc <- mincer12$income / mean(mincer12$income)

mincer12_urban$rinc <- mincer12_urban$income / mean(mincer12_urban$income)
mincer12_rural$rinc <- mincer12_rural$income / mean(mincer12_rural$income)
mincer12_female_urban$rinc <- mincer12_female_urban$income / mean(mincer12_female_urban$income)
mincer12_female_rural$rinc <- mincer12_female_rural$income / mean(mincer12_female_rural$income)
mincer12_male_urban$rinc <- mincer12_male_urban$income / mean(mincer12_male_urban$income)
mincer12_male_rural$rinc <- mincer12_male_rural$income / mean(mincer12_male_rural$income)

In [18]:
mincer12 <- pinc(mincer12)

mincer12_urban <- pinc(mincer12_urban)

mincer12_rural <- pinc(mincer12_rural)

mincer12_female_urban <- pinc(mincer12_female_urban)

mincer12_female_rural <- pinc(mincer12_female_rural)

mincer12_male_urban <- pinc(mincer12_male_urban)

mincer12_male_rural <- pinc(mincer12_male_rural)

In [25]:
mincer14 <- data.frame(
  income = data14$p_income,
  age = data14$cfps2014_age,
  gender = data14$cfps_gender,
  urban = data14$urban14,
  prov = data14$provcd14,
  ethnic = data14$cfps_minzu,
  married = 0,
  party = data14$pn401a,
  postsecondary = 0,
  seniorsecondary = 0,
  juniorsecondary = 0,
  primary = 0,
  illiterate = 0,
  y10 = 0,
  y12 = 0,
  y14 = 1,
  y16 = 0,
  Shanghai = 0,
  Liaoning = 0,
  Guangdong = 0,
  Gansu = 0,
  Others = 0)

for (row in seq_len(nrow(data14))) {
  marriage <- data14[row, "qea0"]
  if (marriage == 2) {
    mincer14[row, "married"] <- 1
  }
  edu <- data14[row, "pw1r"]
  if (edu == 1) {
    mincer14[row, "illiterate"] <- 1
  } else if (edu == 2) {
    mincer14[row, "primary"] <- 1
  } else if (edu == 3) {
    mincer14[row, "juniorsecondary"] <- 1
  } else if (edu == 4) {
    mincer14[row, "seniorsecondary"] <- 1
  } else if (edu > 4) {
    mincer14[row, "postsecondary"] <- 1
  }
  prov <- mincer14[row, "prov"]
  if (prov == 21) {
    mincer14[row, "Liaoning"] <- 1
  } else if (prov == 31) {
    mincer14[row, "Shanghai"] <- 1
  } else if (prov == 44) {
    mincer14[row, "Guangdong"] <- 1
  } else if (prov == 62) {
    mincer14[row, "Gansu"] <- 1
  } else {
    mincer14[row, "Others"] <- 1
  }
}
mincer14 <- filter(mincer14, !is.na(income))

mincer14_urban <- filter(mincer14, urban == 1)
mincer14_rural <- filter(mincer14, urban == 0)
mincer14_female_urban <- filter(mincer14_urban, gender == 1)
mincer14_female_rural <- filter(mincer14_rural, gender == 1)
mincer14_male_urban <- filter(mincer14_urban, gender == 0)
mincer14_male_rural <- filter(mincer14_rural, gender == 0)

mincer14$rinc <- mincer14$income / mean(mincer14$income)

mincer14_urban$rinc <- mincer14_urban$income / mean(mincer14_urban$income)
mincer14_rural$rinc <- mincer14_rural$income / mean(mincer14_rural$income)
mincer14_female_urban$rinc <- mincer14_female_urban$income / mean(mincer14_female_urban$income)
mincer14_female_rural$rinc <- mincer14_female_rural$income / mean(mincer14_female_rural$income)
mincer14_male_urban$rinc <- mincer14_male_urban$income / mean(mincer14_male_urban$income)
mincer14_male_rural$rinc <- mincer14_male_rural$income / mean(mincer14_male_rural$income)

In [31]:
print(nrow(data14))

[1] 1670


In [20]:
mincer14 <- pinc(mincer14)

mincer14_urban <- pinc(mincer14_urban)

mincer14_rural <- pinc(mincer14_rural)

mincer14_female_urban <- pinc(mincer14_female_urban)

mincer14_female_rural <- pinc(mincer14_female_rural)

mincer14_male_urban <- pinc(mincer14_male_urban)

mincer14_male_rural <- pinc(mincer14_male_rural)

In [21]:
mincer16 <- data.frame(
  income = data16$income,
  age = data16$age,
  gender = data16$gender,
  urban = data16$urban16,
  prov = data16$provcd16,
  ethnic = data16$ethnic,
  married = data16$married,
  party = data16$party,
  postsecondary = data16$postsecondary,
  seniorsecondary = data16$seniorsecondary,
  juniorsecondary = data16$juniorsecondary,
  primary = data16$primary,
  illiterate = data16$illiterate,
  y10 = 0,
  y12 = 0,
  y14 = 0,
  y16 = 1)

for (row in seq_len(nrow(data16))) {
  prov <- mincer16[row, "prov"]
  if (prov == 21) {
    mincer16[row, "Liaoning"] <- 1
  } else if (prov == 31) {
    mincer16[row, "Shanghai"] <- 1
  } else if (prov == 44) {
    mincer16[row, "Guangdong"] <- 1
  } else if (prov == 62) {
    mincer16[row, "Gansu"] <- 1
  } else {
    mincer16[row, "Others"] <- 1
  }
}
mincer16 <- filter(mincer16, !is.na(income))

mincer16_urban <- filter(mincer16, urban == 1)
mincer16_rural <- filter(mincer16, urban == 0)
mincer16_female_urban <- filter(mincer16_urban, gender == 1)
mincer16_female_rural <- filter(mincer16_rural, gender == 1)
mincer16_male_urban <- filter(mincer16_urban, gender == 0)
mincer16_male_rural <- filter(mincer16_rural, gender == 0)

mincer16$rinc <- mincer16$income / mean(mincer16$income)

mincer16_urban$rinc <- mincer16_urban$income / mean(mincer16_urban$income)
mincer16_rural$rinc <- mincer16_rural$income / mean(mincer16_rural$income)
mincer16_female_urban$rinc <- mincer16_female_urban$income / mean(mincer16_female_urban$income)
mincer16_female_rural$rinc <- mincer16_female_rural$income / mean(mincer16_female_rural$income)
mincer16_male_urban$rinc <- mincer16_male_urban$income / mean(mincer16_male_urban$income)
mincer16_male_rural$rinc <- mincer16_male_rural$income / mean(mincer16_male_rural$income)

In [22]:
mincer16 <- pinc(mincer16)

mincer16_urban <- pinc(mincer16_urban)

mincer16_rural <- pinc(mincer16_rural)

mincer16_female_urban <- pinc(mincer16_female_urban)

mincer16_female_rural <- pinc(mincer16_female_rural)

mincer16_male_urban <- pinc(mincer16_male_urban)

mincer16_male_rural <- pinc(mincer16_male_rural)

In [23]:
combined <- full_join(mincer10, full_join(mincer12, full_join(mincer14, mincer16)))
combined$lninc <- log(combined$income)
combined$lnrinc <- log(combined$rinc)

urban <- full_join(mincer10_urban, full_join(mincer12_urban, full_join(mincer14_urban, mincer16_urban)))
urban$lninc <- log(urban$income)
urban$lnrinc <- log(urban$rinc)


rural <- full_join(mincer10_rural, full_join(mincer12_rural, full_join(mincer14_rural, mincer16_rural)))
rural$lninc <- log(rural$income)
rural$lnrinc <- log(rural$rinc)


female_urban <- full_join(mincer10_female_urban, full_join(mincer12_female_urban, full_join(mincer14_female_urban, mincer16_female_urban)))
female_urban$lninc <- log(female_urban$income)
female_urban$lnrinc <- log(female_urban$rinc)


female_rural <- full_join(mincer10_female_rural, full_join(mincer12_female_rural, full_join(mincer14_female_rural, mincer16_female_rural)))
female_rural$lninc <- log(female_rural$income)
female_rural$lnrinc <- log(female_rural$rinc)


male_urban <- full_join(mincer10_male_urban, full_join(mincer12_male_urban, full_join(mincer14_male_urban, mincer16_male_urban)))
male_urban$lninc <- log(male_urban$income)
male_urban$lnrinc <- log(male_urban$rinc)

male_rural <- full_join(mincer10_male_rural, full_join(mincer12_male_rural, full_join(mincer14_male_rural, mincer16_male_rural)))
male_rural$lninc <- log(male_rural$income)
male_rural$lnrinc <- log(male_rural$rinc)

Joining, by = c("income", "age", "gender", "urban", "prov", "ethnic", "married", "party", "postsecondary", "seniorsecondary", "juniorsecondary", "primary", "illiterate", "y10", "y12", "y14", "y16", "Shanghai", "Liaoning", "Guangdong", "Gansu", "Others", "rinc", "pinc", "lnpinc")

Joining, by = c("income", "age", "gender", "urban", "prov", "ethnic", "married", "party", "postsecondary", "seniorsecondary", "juniorsecondary", "primary", "illiterate", "y10", "y12", "y14", "y16", "Shanghai", "Liaoning", "Guangdong", "Gansu", "Others", "rinc", "pinc", "lnpinc")

Joining, by = c("income", "age", "gender", "urban", "prov", "ethnic", "married", "party", "postsecondary", "seniorsecondary", "juniorsecondary", "primary", "illiterate", "y10", "y12", "y14", "y16", "Shanghai", "Liaoning", "Guangdong", "Gansu", "Others", "rinc", "pinc", "lnpinc")

"NaNs produced"
"NaNs produced"
Joining, by = c("income", "age", "gender", "urban", "prov", "ethnic", "married", "party", "postsecondary", "seniorsecondary",

### Computing Lorenz curves for combined, male, and female for each year

In [None]:
lorenz_combined <- filter(combined, income >= 0)

lorenz_combined$year_str[lorenz_combined$y10 == 1] <- "Year 2010"
lorenz_combined$year_str[lorenz_combined$y12 == 1] <- "Year 2012"
lorenz_combined$year_str[lorenz_combined$y14 == 1] <- "Year 2014"
lorenz_combined$year_str[lorenz_combined$y16 == 1] <- "Year 2016"

lorenz_male <- filter(lorenz_combined, gender == 0)
lorenz_female <- filter(lorenz_combined, gender == 1)

ggplot(lorenz_combined, aes(x = income, colour = year_str)) +
  stat_lorenz() +
  geom_abline(color = "grey") +
  labs(x = "Cumulative percentages of all income",
       y = "Cumulative percentage of percentage of people with that income",
       title = "Lorenz curves for combined data")

ggsave("LorenzCurves/Lorenz_curve_combined.png")

ggplot(lorenz_male, aes(x = income, colour = year_str)) +
  stat_lorenz() +
  geom_abline(color = "grey") +
  labs(x = "Cumulative percentages of all income",
       y = "Cumulative percentage of percentage of people with that income",
       title = "Lorenz curves for male data")

ggsave("LorenzCurves/Lorenz_curve_male.png")

ggplot(lorenz_female, aes(x = income, colour = year_str)) +
  stat_lorenz() +
  geom_abline(color = "grey") +
  labs(x = "Cumulative percentages of all income",
       y = "Cumulative percentage of percentage of people with that income",
       title = "Lorenz curves for female data")

ggsave("LorenzCurves/Lorenz_curve_female.png")

In [27]:
sprintf("Combined: %f, %f, %f, %f", Gini(mincer10$inc), Gini(mincer12$inc), Gini(mincer14$inc), Gini(mincer16$inc))
sprintf("Urban: %f, %f, %f, %f", Gini(mincer10_urban$inc), Gini(mincer12_urban$inc), Gini(mincer14_urban$inc), Gini(mincer16_urban$inc))
sprintf("Rural: %f, %f, %f, %f", Gini(mincer10_rural$inc), Gini(mincer12_rural$inc), Gini(mincer14_rural$inc), Gini(mincer16_rural$inc))
sprintf("Female Urban: %f, %f, %f, %f", Gini(mincer10_female_urban$inc), Gini(mincer12_female_urban$inc), Gini(mincer14_female_urban$inc), Gini(mincer16_female_urban$inc))
sprintf("Female Rural: %f, %f, %f, %f", Gini(mincer10_female_rural$inc), Gini(mincer12_female_rural$inc), Gini(mincer14_female_rural$inc), Gini(mincer16_female_rural$inc))
sprintf("Male Urban: %f, %f, %f, %f", Gini(mincer10_male_urban$inc), Gini(mincer12_male_urban$inc), Gini(mincer14_male_urban$inc), Gini(mincer16_male_urban$inc))
sprintf("male Rural: %f, %f, %f, %f", Gini(mincer10_male_rural$inc), Gini(mincer12_male_rural$inc), Gini(mincer14_male_rural$inc), Gini(mincer16_male_rural$inc))

## Running the descriptive analysis for the all of the sub populations

### For combined data

In [None]:
hist(combined$lninc)
hist(urban$lninc)
hist(rural$lninc)
hist(female_urban$lninc)
hist(female_rural$lninc)
hist(male_urban$lninc)
hist(male_rural$lninc)


In [None]:
hist(combined$lnrinc)
hist(urban$lnrinc)
hist(rural$lnrinc)
hist(female_urban$lnrinc)
hist(female_rural$lnrinc)
hist(male_urban$lnrinc)
hist(male_rural$lnrinc)

In [None]:
hist(combined$lnpinc)
hist(urban$lnpinc)
hist(rural$lnpinc)
hist(female_urban$lnpinc)
hist(female_rural$lnpinc)
hist(male_urban$lnpinc)
hist(male_rural$lnpinc)

## Running the regressions for lninc, rinc, lnrinc, pinc, and lnpinc on combined data, and urban and rural subsamples. Does not contain married or party variables

### Regression for lninc, without province dummies

In [None]:
lnincReg <- filter(combined, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14, data=lnincReg))

urban_reg <- filter(urban, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=urban_reg))

rural_reg <- filter(rural, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=rural_reg))

female_urban_reg <- filter(female_urban, !is.infinite(lninc) & !is.na(lninc))
female_rural_reg <- filter(female_rural, !is.infinite(lninc) & !is.na(lninc))
male_urban_reg <- filter(male_urban, !is.infinite(lninc) & !is.na(lninc))
male_rural_reg <- filter(male_rural, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_urban_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_rural_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_urban_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_rural_reg))

### Regression for lninc, with province dummies

In [44]:
lnincReg <- filter(combined, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=lnincReg))

urban_reg <- filter(urban, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=urban_reg))

rural_reg <- filter(rural, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=rural_reg))

female_urban_reg <- filter(female_urban, !is.infinite(lninc) & !is.na(lninc))
female_rural_reg <- filter(female_rural, !is.infinite(lninc) & !is.na(lninc))
male_urban_reg <- filter(male_urban, !is.infinite(lninc) & !is.na(lninc))
male_rural_reg <- filter(male_rural, !is.infinite(lninc) & !is.na(lninc))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_urban_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_rural_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_urban_reg))
summary(lm(lninc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_rural_reg))


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = lnincReg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4413  -0.5365   0.2398   0.8335   4.8801 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      8.145087   0.056810 143.375  < 2e-16 ***
postsecondary    1.694561   0.026138  64.830  < 2e-16 ***
seniorsecondary  1.242750   0.023394  53.124  < 2e-16 ***
juniorsecondary  1.050356   0.019855  52.900  < 2e-16 ***
primary          0.606428   0.021845  27.760  < 2e-16 ***
gender           0.291400   0.009835  29.629  < 2e-16 ***
urban           -0.009248   0.006084  -1.520    0.129    
y10              0.319483   0.052349   6.103 1.05e-09 ***
y12              0.949174   0.052463  18.092  < 2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning   


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4171  -0.3944   0.1902   0.6844   4.2934 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.83100    0.08182   95.71   <2e-16 ***
postsecondary    1.49788    0.03193   46.91   <2e-16 ***
seniorsecondary  1.05889    0.03071   34.48   <2e-16 ***
juniorsecondary  0.91739    0.02861   32.06   <2e-16 ***
primary          0.56727    0.03317   17.10   <2e-16 ***
gender           0.25354    0.01279   19.83   <2e-16 ***
y10              0.83469    0.07687   10.86   <2e-16 ***
y12              1.36968    0.07736   17.70   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.47441    0.03460  -13.71   <2e-16 ***
Guangdong       -0.39571    0


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.6190 -0.6693  0.2609  0.9180  5.6169 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.12998    0.10744   66.36   <2e-16 ***
postsecondary    1.34657    0.05539   24.31   <2e-16 ***
seniorsecondary  1.07236    0.03835   27.96   <2e-16 ***
juniorsecondary  0.97200    0.02786   34.89   <2e-16 ***
primary          0.57485    0.02891   19.88   <2e-16 ***
gender           0.36897    0.01504   24.54   <2e-16 ***
y10              1.23016    0.09126   13.48   <2e-16 ***
y12              1.95853    0.09224   21.23   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -1.00822    0.06684  -15.08   <2e-16 ***
Guangdong       -0.81053    0.06718  -1


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4059  -0.3756   0.1628   0.6242   4.0815 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      8.63406    0.10115  85.358  < 2e-16 ***
postsecondary    1.23359    0.04293  28.737  < 2e-16 ***
seniorsecondary  0.79798    0.04146  19.248  < 2e-16 ***
juniorsecondary  0.75727    0.03905  19.390  < 2e-16 ***
primary          0.37326    0.04415   8.454  < 2e-16 ***
y10              0.53824    0.09204   5.848 5.12e-09 ***
y12              0.97696    0.09247  10.565  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.43334    0.04340  -9.985  < 2e-16 ***
Guangdong       -0.37074    0.04243  -8.738  < 2e-16 ***
Gansu           -0.44016    0.0


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.6181 -0.5310  0.2357  0.8558  4.6663 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.83398    0.13522  57.935   <2e-16 ***
postsecondary    1.15510    0.06571  17.580   <2e-16 ***
seniorsecondary  0.96420    0.04488  21.484   <2e-16 ***
juniorsecondary  0.86766    0.03454  25.120   <2e-16 ***
primary          0.47848    0.03634  13.165   <2e-16 ***
y10              0.96213    0.11504   8.363   <2e-16 ***
y12              1.65433    0.11577  14.290   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.77162    0.08131  -9.490   <2e-16 ***
Guangdong       -0.73784    0.08229  -8.967   <2e-16 ***
Gansu           -0.96989    0.07490 -12.9


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.1624  -0.3962   0.2283   0.7312   4.0713 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.27520    0.13120  55.451   <2e-16 ***
postsecondary    1.65049    0.04742  34.804   <2e-16 ***
seniorsecondary  1.20218    0.04556  26.390   <2e-16 ***
juniorsecondary  0.91600    0.04207  21.771   <2e-16 ***
primary          0.62451    0.04986  12.525   <2e-16 ***
y10              1.23667    0.12656   9.772   <2e-16 ***
y12              1.83793    0.12764  14.399   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.47487    0.05395  -8.803   <2e-16 ***
Guangdong       -0.43902    0.05322  -8.250   <2e-16 ***
Gansu           -0.81752    0.078


Call:
lm(formula = lninc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.9969 -0.7690  0.2868  1.0269  5.2679 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      7.00045    0.16854  41.536   <2e-16 ***
postsecondary    1.48642    0.09477  15.684   <2e-16 ***
seniorsecondary  1.01037    0.06979  14.478   <2e-16 ***
juniorsecondary  0.89643    0.04651  19.273   <2e-16 ***
primary          0.52805    0.04683  11.277   <2e-16 ***
y10              1.45121    0.14275  10.166   <2e-16 ***
y12              2.12357    0.14555  14.590   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -1.32110    0.10912 -12.106   <2e-16 ***
Guangdong       -0.97815    0.10910  -8.966   <2e-16 ***
Gansu           -1.49325    0.09913 -15.063

In [83]:
index <- c("combined", "urban", "rural", "female_urban", "female_rural", "male_urban", "male_rural")
mean_lninc <- c(mean(lnincReg$lninc), mean(urban_reg$lninc), mean(rural_reg$lninc), mean(female_urban_reg$lninc), mean(female_rural_reg$lninc), mean(male_urban_reg$lninc), mean(male_rural_reg$lninc))
median_lninc <- c(median(lnincReg$lninc), median(urban_reg$lninc), median(rural_reg$lninc), median(female_urban_reg$lninc), median(female_rural_reg$lninc), median(male_urban_reg$lninc), median(male_rural_reg$lninc))
var_lninc <- c(var(lnincReg$lninc), var(urban_reg$lninc), var(rural_reg$lninc), var(female_urban_reg$lninc), var(female_rural_reg$lninc), var(male_urban_reg$lninc), var(male_rural_reg$lninc))
kurtosis_lninc <- c(kurtosis(lnincReg$lninc), kurtosis(urban_reg$lninc), kurtosis(rural_reg$lninc), kurtosis(female_urban_reg$lninc), kurtosis(female_rural_reg$lninc), kurtosis(male_urban_reg$lninc), kurtosis(male_rural_reg$lninc))
skew_lninc <- c(skewness(lnincReg$lninc), skewness(urban_reg$lninc), skewness(rural_reg$lninc), skewness(female_urban_reg$lninc), skewness(female_rural_reg$lninc), skewness(male_urban_reg$lninc), skewness(male_rural_reg$lninc))

lninc <- data.frame("subsamples"=index, "mean" = mean_lninc, "median" = median_lninc, "var" = var_lninc, "kurtosis" = kurtosis_lninc, "skew" = skew_lninc)
lninc$stdev <- sqrt(lninc$var)

In [84]:
print("Lninc descriptive statistics")
print(lninc)

[1] "Lninc descriptive statistics"
    subsamples     mean   median      var kurtosis      skew    stdev
1     combined 9.061718 9.392662 2.511959 5.009130 -1.613858 1.584916
2        urban 9.468470 9.784704 1.923729 8.769189 -2.052226 1.386986
3        rural 8.621783 8.922658 2.748698 3.659157 -1.353109 1.657920
4 female_urban 9.714591 9.903488 1.506458 8.792632 -1.866556 1.227378
5 female_rural 9.000477 9.210340 2.116679 4.794995 -1.445970 1.454881
6   male_urban 9.180661 9.510445 2.259995 8.187224 -2.114384 1.503328
7   male_rural 8.096532 8.294050 3.157891 2.919475 -1.226031 1.777046


### Regression for rinc and lnrinc, without province dummies

In [None]:
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14, data=combined))
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=urban))
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=rural))

lnrincReg <- filter(combined, !is.infinite(lnrinc) & !is.na(lnrinc))
urban_reg <- filter(urban, !is.infinite(lnrinc) & !is.na(lnrinc))
rural_reg <- filter(rural, !is.infinite(lnrinc) & !is.na(lnrinc))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14, data=lnrincReg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=rural_reg))


female_urban_reg <- filter(female_urban, !is.infinite(lnrinc) & !is.na(lnrinc))
female_rural_reg <- filter(female_rural, !is.infinite(lnrinc) & !is.na(lnrinc))
male_urban_reg <- filter(male_urban, !is.infinite(lnrinc) & !is.na(lnrinc))
male_rural_reg <- filter(male_rural, !is.infinite(lnrinc) & !is.na(lnrinc))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_rural_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_rural_reg))

In [86]:
index <- c("combined", "urban", "rural", "female_urban", "female_rural", "male_urban", "male_rural")
mean_lnrinc <- c(mean(lnrincReg$lnrinc), mean(urban_reg$lnrinc), mean(rural_reg$lnrinc), mean(female_urban_reg$lnrinc), mean(female_rural_reg$lnrinc), mean(male_urban_reg$lnrinc), mean(male_rural_reg$lnrinc))
median_lnrinc <- c(median(lnrincReg$lnrinc), median(urban_reg$lnrinc), median(rural_reg$lnrinc), median(female_urban_reg$lnrinc), median(female_rural_reg$lnrinc), median(male_urban_reg$lnrinc), median(male_rural_reg$lnrinc))
var_lnrinc <- c(var(lnincReg$lnrinc), var(urban_reg$lnrinc), var(rural_reg$lnrinc), var(female_urban_reg$lnrinc), var(female_rural_reg$lnrinc), var(male_urban_reg$lnrinc), var(male_rural_reg$lnrinc))
kurtosis_lnrinc <- c(kurtosis(lnrincReg$lnrinc), kurtosis(urban_reg$lnrinc), kurtosis(rural_reg$lnrinc), kurtosis(female_urban_reg$lnrinc), kurtosis(female_rural_reg$lnrinc), kurtosis(male_urban_reg$lnrinc), kurtosis(male_rural_reg$lnrinc))
skew_lnrinc <- c(skewness(lnrincReg$lnrinc), skewness(urban_reg$lnrinc), skewness(rural_reg$lnrinc), skewness(female_urban_reg$lnrinc), skewness(female_rural_reg$lnrinc), skewness(male_urban_reg$lnrinc), skewness(male_rural_reg$lnrinc))

lnrinc <- data.frame("subsamples"=index, "mean" = mean_lnrinc, "median" = median_lnrinc, "var" = var_lnrinc, "kurtosis" = kurtosis_lnrinc, "skew" = skew_lnrinc)
lnrinc$stdev <- sqrt(lnrinc$var)

In [None]:
print("Lnrinc descriptive statistics")
print(lnrinc)

### Regression for rinc and lnrinc, with province dummies

In [47]:
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=combined))
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=urban))
summary(lm(rinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=rural))

summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=lnrincReg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=rural_reg))

summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_rural_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_urban_reg))
summary(lm(lnrinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_rural_reg))


Call:
lm(formula = rinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = combined)

Residuals:
    Min      1Q  Median      3Q     Max 
 -4.018  -0.770  -0.241   0.295 271.661 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1.458174   0.071350  20.437   <2e-16 ***
postsecondary    2.271554   0.035448  64.081   <2e-16 ***
seniorsecondary  0.933518   0.029724  31.406   <2e-16 ***
juniorsecondary  0.640470   0.024226  26.437   <2e-16 ***
primary          0.271922   0.026576  10.232   <2e-16 ***
gender           0.260747   0.011882  21.944   <2e-16 ***
urban            0.015897   0.007971   1.994   0.0461 *  
y10              0.011316   0.065094   0.174   0.8620    
y12             -0.055383   0.064594  -0.857   0.3912    
y14                    NA         NA      NA       NA    
Liaoning        -1.227


Call:
lm(formula = rinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = urban)

Residuals:
    Min      1Q  Median      3Q     Max 
 -3.045  -0.657  -0.205   0.351 188.186 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1.16526    0.10523  11.073  < 2e-16 ***
postsecondary    1.74654    0.04373  39.941  < 2e-16 ***
seniorsecondary  0.68910    0.04036  17.074  < 2e-16 ***
juniorsecondary  0.47674    0.03671  12.988  < 2e-16 ***
primary          0.21492    0.04258   5.047 4.51e-07 ***
gender           0.27961    0.01748  16.001  < 2e-16 ***
y10             -0.14661    0.09913  -1.479    0.139    
y12             -0.15236    0.09926  -1.535    0.125    
y14                   NA         NA      NA       NA    
Liaoning        -0.82109    0.04966 -16.533  < 2e-16 ***
Guangdong       -0.66446    0.04854 -13.689 


Call:
lm(formula = rinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = rural)

Residuals:
    Min      1Q  Median      3Q     Max 
 -3.926  -0.945  -0.391   0.196 121.305 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      1.73235    0.11942  14.507   <2e-16 ***
postsecondary    2.07652    0.06760  30.720   <2e-16 ***
seniorsecondary  1.18822    0.04358  27.266   <2e-16 ***
juniorsecondary  0.92804    0.03042  30.509   <2e-16 ***
primary          0.39643    0.03151  12.582   <2e-16 ***
gender           0.29334    0.01504  19.502   <2e-16 ***
y10             -0.03269    0.09718  -0.336   0.7366    
y12             -0.17578    0.09721  -1.808   0.0706 .  
y14                   NA         NA      NA       NA    
Liaoning        -1.43364    0.08124 -17.648   <2e-16 ***
Guangdong       -1.31791    0.08262 -15.951 


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = lnrincReg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4413  -0.5365   0.2398   0.8335   4.8801 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.198323   0.056810 -21.094   <2e-16 ***
postsecondary    1.694561   0.026138  64.830   <2e-16 ***
seniorsecondary  1.242750   0.023394  53.124   <2e-16 ***
juniorsecondary  1.050356   0.019855  52.900   <2e-16 ***
primary          0.606428   0.021845  27.760   <2e-16 ***
gender           0.291400   0.009835  29.629   <2e-16 ***
urban           -0.009248   0.006084  -1.520    0.129    
y10              0.440028   0.052349   8.406   <2e-16 ***
y12              0.985513   0.052463  18.785   <2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning 


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4171  -0.3944   0.1902   0.6844   4.2934 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.48278    0.08182 -18.122  < 2e-16 ***
postsecondary    1.49788    0.03193  46.909  < 2e-16 ***
seniorsecondary  1.05889    0.03071  34.483  < 2e-16 ***
juniorsecondary  0.91739    0.02861  32.061  < 2e-16 ***
primary          0.56727    0.03317  17.101  < 2e-16 ***
gender           0.25354    0.01279  19.829  < 2e-16 ***
y10              0.57900    0.07687   7.533 5.18e-14 ***
y12              1.01085    0.07736  13.067  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.47441    0.03460 -13.711  < 2e-16 ***
Guangdong       -0.39571    


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.6190 -0.6693  0.2609  0.9180  5.6169 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.51446    0.10744  -14.10   <2e-16 ***
postsecondary    1.34657    0.05539   24.31   <2e-16 ***
seniorsecondary  1.07236    0.03835   27.96   <2e-16 ***
juniorsecondary  0.97200    0.02786   34.89   <2e-16 ***
primary          0.57485    0.02891   19.88   <2e-16 ***
gender           0.36897    0.01504   24.54   <2e-16 ***
y10              1.09353    0.09126   11.98   <2e-16 ***
y12              1.76236    0.09224   19.11   <2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -1.00822    0.06684  -15.08   <2e-16 ***
Guangdong       -0.81053    0.06718  -


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4059  -0.3756   0.1628   0.6242   4.0815 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.03674    0.10115 -10.249  < 2e-16 ***
postsecondary    1.23359    0.04293  28.737  < 2e-16 ***
seniorsecondary  0.79798    0.04146  19.248  < 2e-16 ***
juniorsecondary  0.75727    0.03905  19.390  < 2e-16 ***
primary          0.37326    0.04415   8.454  < 2e-16 ***
y10              0.35573    0.09204   3.865 0.000112 ***
y12              0.69337    0.09247   7.498 6.99e-14 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.43334    0.04340  -9.985  < 2e-16 ***
Guangdong       -0.37074    0.04243  -8.738  < 2e-16 ***
Gansu           -0.44016    0.


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.6181 -0.5310  0.2357  0.8558  4.6663 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.18521    0.13522  -8.765  < 2e-16 ***
postsecondary    1.15510    0.06571  17.580  < 2e-16 ***
seniorsecondary  0.96420    0.04488  21.484  < 2e-16 ***
juniorsecondary  0.86766    0.03454  25.120  < 2e-16 ***
primary          0.47848    0.03634  13.165  < 2e-16 ***
y10              0.85114    0.11504   7.398 1.47e-13 ***
y12              1.41806    0.11577  12.249  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.77162    0.08131  -9.490  < 2e-16 ***
Guangdong       -0.73784    0.08229  -8.967  < 2e-16 ***
Gansu           -0.96989    0.07490 -12.


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.1624  -0.3962   0.2283   0.7312   4.0713 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.43022    0.13120 -10.901  < 2e-16 ***
postsecondary    1.65049    0.04742  34.804  < 2e-16 ***
seniorsecondary  1.20218    0.04556  26.390  < 2e-16 ***
juniorsecondary  0.91600    0.04207  21.771  < 2e-16 ***
primary          0.62451    0.04986  12.525  < 2e-16 ***
y10              0.72861    0.12656   5.757 8.82e-09 ***
y12              1.22899    0.12764   9.628  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.47487    0.05395  -8.803  < 2e-16 ***
Guangdong       -0.43902    0.05322  -8.250  < 2e-16 ***
Gansu           -0.81752    0.07


Call:
lm(formula = lnrinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_rural_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.9969 -0.7690  0.2868  1.0269  5.2679 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.16401    0.16854  -6.906 5.33e-12 ***
postsecondary    1.48642    0.09477  15.684  < 2e-16 ***
seniorsecondary  1.01037    0.06979  14.478  < 2e-16 ***
juniorsecondary  0.89643    0.04651  19.273  < 2e-16 ***
primary          0.52805    0.04683  11.277  < 2e-16 ***
y10              1.34478    0.14275   9.420  < 2e-16 ***
y12              2.13629    0.14555  14.677  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -1.32110    0.10912 -12.106  < 2e-16 ***
Guangdong       -0.97815    0.10910  -8.966  < 2e-16 ***
Gansu           -1.49325    0.09913 -15.06

### Regression for pinc and lnpinc, without province dummies

In [None]:
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14, data=combined))
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=urban))
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=rural))

lnpincReg <- filter(combined, !is.infinite(lnpinc) & !is.na(lnpinc))
urban_reg <- filter(urban, !is.infinite(lnpinc) & !is.na(lnpinc))
rural_reg <- filter(rural, !is.infinite(lnpinc) & !is.na(lnpinc))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14, data=lnpincReg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14, data=rural_reg))


female_urban_reg <- filter(female_urban, !is.infinite(lnpinc) & !is.na(lnpinc))
female_rural_reg <- filter(female_rural, !is.infinite(lnpinc) & !is.na(lnpinc))
male_urban_reg <- filter(male_urban, !is.infinite(lnpinc) & !is.na(lnpinc))
male_rural_reg <- filter(male_rural, !is.infinite(lnpinc) & !is.na(lnpinc))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=female_rural_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14, data=male_rural_reg))

In [34]:
index <- c("combined", "urban", "rural", "female_urban", "female_rural", "male_urban", "male_rural")
mean_lnpinc <- c(mean(lnpincReg$lnpinc), mean(urban_reg$lnpinc), mean(rural_reg$lnpinc), mean(female_urban_reg$lnpinc), mean(female_rural_reg$lnpinc), mean(male_urban_reg$lnpinc), mean(male_rural_reg$lnpinc))
median_lnpinc <- c(median(lnpincReg$lnpinc), median(urban_reg$lnpinc), median(rural_reg$lnpinc), median(female_urban_reg$lnpinc), median(female_rural_reg$lnpinc), median(male_urban_reg$lnpinc), median(male_rural_reg$lnpinc))
var_lnpinc <- c(var(lnpincReg$lnpinc), var(urban_reg$lnpinc), var(rural_reg$lnpinc), var(female_urban_reg$lnpinc), var(female_rural_reg$lnpinc), var(male_urban_reg$lnpinc), var(male_rural_reg$lnpinc))
kurtosis_lnpinc <- c(kurtosis(lnpincReg$lnpinc), kurtosis(urban_reg$lnpinc), kurtosis(rural_reg$lnpinc), kurtosis(female_urban_reg$lnpinc), kurtosis(female_rural_reg$lnpinc), kurtosis(male_urban_reg$lnpinc), kurtosis(male_rural_reg$lnpinc))
skew_lnpinc <- c(skewness(lnpincReg$lnpinc), skewness(urban_reg$lnpinc), skewness(rural_reg$lnpinc), skewness(female_urban_reg$lnpinc), skewness(female_rural_reg$lnpinc), skewness(male_urban_reg$lnpinc), skewness(male_rural_reg$lnpinc))

lnpinc <- data.frame("subsamples"=index, "mean" = mean_lnpinc, "median" = median_lnpinc, "var" = var_lnpinc, "kurtosis" = kurtosis_lnpinc, "skew" = skew_lnpinc)
lnpinc$stdev <- sqrt(lnpinc$var)

In [35]:
print("Lnpinc descriptive statistics")
print(lnpinc)

[1] "Lnpinc descriptive statistics"
    subsamples       mean     median       var kurtosis      skew     stdev
1     combined -0.4929644 -0.4137467 0.1729911 3.460895 -1.504673 0.4159220
2        urban -0.5239991 -0.4371708 0.1657071 1.414272 -1.098564 0.4070714
3        rural -0.4672305 -0.3412846 0.1631424 1.812138 -1.317396 0.4039089
4 female_urban -0.6097785 -0.4938689 0.2372013 1.098065 -1.111018 0.4870332
5 female_rural -0.5606974 -0.4242731 0.2443960 1.507585 -1.342581 0.4943642
6   male_urban -0.4587463 -0.3767322 0.1266873 2.115663 -1.204974 0.3559316
7   male_rural -0.4020730 -0.2807868 0.1244622 2.958936 -1.463058 0.3527921


### Regression for pinc and lnpinc, with province dummies

In [49]:
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=combined))
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=urban))
summary(lm(pinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=rural))

summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=lnpincReg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=rural_reg))

summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=female_rural_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_urban_reg))
summary(lm(lnpinc ~  postsecondary + seniorsecondary + juniorsecondary + primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + Others, data=male_rural_reg))


Call:
lm(formula = pinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = combined)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.90849 -0.28018  0.01455  0.27072  1.17398 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.473133   0.010103  46.831  < 2e-16 ***
postsecondary    0.403835   0.005019  80.455  < 2e-16 ***
seniorsecondary  0.258196   0.004209  61.346  < 2e-16 ***
juniorsecondary  0.201110   0.003430  58.626  < 2e-16 ***
primary          0.109040   0.003763  28.977  < 2e-16 ***
gender           0.059591   0.001683  35.418  < 2e-16 ***
urban            0.003772   0.001129   3.342 0.000833 ***
y10             -0.031842   0.009217  -3.455 0.000551 ***
y12             -0.164484   0.009146 -17.984  < 2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning    


Call:
lm(formula = pinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = urban)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.83406 -0.26185  0.04785  0.24975  1.11912 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.456531   0.015775  28.940  < 2e-16 ***
postsecondary    0.381754   0.006555  58.236  < 2e-16 ***
seniorsecondary  0.231332   0.006050  38.235  < 2e-16 ***
juniorsecondary  0.173988   0.005503  31.619  < 2e-16 ***
primary          0.093672   0.006384  14.674  < 2e-16 ***
gender           0.066484   0.002620  25.378  < 2e-16 ***
y10             -0.070708   0.014860  -4.758 1.96e-06 ***
y12             -0.143116   0.014880  -9.618  < 2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.120572   0.007445 -16.195  < 2e-16 ***
Guangdong       -0.0986


Call:
lm(formula = pinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = rural)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.80944 -0.30993 -0.05808  0.29733  1.21931 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      0.543362   0.018222  29.818  < 2e-16 ***
postsecondary    0.375960   0.010315  36.449  < 2e-16 ***
seniorsecondary  0.246103   0.006650  37.008  < 2e-16 ***
juniorsecondary  0.212295   0.004642  45.736  < 2e-16 ***
primary          0.121855   0.004808  25.345  < 2e-16 ***
gender           0.063398   0.002295  27.621  < 2e-16 ***
y10             -0.043428   0.014829  -2.929  0.00341 ** 
y12             -0.233790   0.014833 -15.761  < 2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.218081   0.012396 -17.593  < 2e-16 ***
Guangdong       -0.1547


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + urban + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = lnpincReg)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.07991 -0.18090  0.03977  0.22536  1.09862 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.053580   0.011545 -91.259  < 2e-16 ***
postsecondary    0.471706   0.006460  73.016  < 2e-16 ***
seniorsecondary  0.362452   0.005791  62.586  < 2e-16 ***
juniorsecondary  0.296784   0.004907  60.483  < 2e-16 ***
primary          0.183668   0.005398  34.028  < 2e-16 ***
gender           0.069202   0.002449  28.253  < 2e-16 ***
urban           -0.010123   0.001357  -7.462 8.67e-14 ***
y10              0.378317   0.010199  37.095  < 2e-16 ***
y12              0.623118   0.010325  60.353  < 2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning 


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = urban_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.0862 -0.1895  0.0467  0.2424  1.0930 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.010056   0.017963  -56.23   <2e-16 ***
postsecondary    0.493427   0.008859   55.70   <2e-16 ***
seniorsecondary  0.350724   0.008520   41.16   <2e-16 ***
juniorsecondary  0.282232   0.007923   35.62   <2e-16 ***
primary          0.170295   0.009171   18.57   <2e-16 ***
gender           0.078529   0.003567   22.02   <2e-16 ***
y10              0.236962   0.016508   14.35   <2e-16 ***
y12              0.448045   0.016725   26.79   <2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.147914   0.009638  -15.35   <2e-16 ***
Guangdong       -0.123338  


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + gender + y10 + y12 + y14 + Liaoning + Guangdong + 
    Gansu + Others, data = rural_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.76170 -0.17346  0.03326  0.22645  0.96157 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -0.867996   0.020031  -43.33   <2e-16 ***
postsecondary    0.332380   0.012893   25.78   <2e-16 ***
seniorsecondary  0.284251   0.008926   31.85   <2e-16 ***
juniorsecondary  0.249788   0.006471   38.60   <2e-16 ***
primary          0.170082   0.006701   25.38   <2e-16 ***
gender           0.076985   0.003506   21.96   <2e-16 ***
y10              0.298826   0.015232   19.62   <2e-16 ***
y12              0.600862   0.015633   38.44   <2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.229685   0.015568  -14.75   <2e-16 ***
Guangdong       -


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_urban_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.48398 -0.24291  0.07019  0.30691  1.07589 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.06062    0.03250 -32.632  < 2e-16 ***
postsecondary    0.58720    0.01655  35.487  < 2e-16 ***
seniorsecondary  0.39885    0.01598  24.959  < 2e-16 ***
juniorsecondary  0.34687    0.01503  23.079  < 2e-16 ***
primary          0.18354    0.01699  10.803  < 2e-16 ***
y10              0.16419    0.02885   5.692 1.29e-08 ***
y12              0.42792    0.02912  14.697  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.17175    0.01682 -10.212  < 2e-16 ***
Guangdong       -0.14248    0.01646  -8.657  < 2e-16 ***
Gansu           -0.16794    0.


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = female_rural_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.17548 -0.22610  0.05625  0.30136  1.06539 

Coefficients: (1 not defined because of singularities)
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -0.89795    0.03686 -24.358  < 2e-16 ***
postsecondary    0.42316    0.02173  19.477  < 2e-16 ***
seniorsecondary  0.37208    0.01485  25.062  < 2e-16 ***
juniorsecondary  0.32612    0.01140  28.603  < 2e-16 ***
primary          0.20651    0.01199  17.231  < 2e-16 ***
y10              0.19813    0.02873   6.897 5.59e-12 ***
y12              0.55602    0.02915  19.073  < 2e-16 ***
y14                   NA         NA      NA       NA    
Liaoning        -0.24868    0.02689  -9.247  < 2e-16 ***
Guangdong       -0.23625    0.02724  -8.673  < 2e-16 ***
Gansu           -0.30683    0.


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_urban_reg)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.7995 -0.1558  0.0354  0.1928  0.9621 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -0.887099   0.021620 -41.031   <2e-16 ***
postsecondary    0.421810   0.010489  40.216   <2e-16 ***
seniorsecondary  0.307508   0.010078  30.514   <2e-16 ***
juniorsecondary  0.216790   0.009286  23.347   <2e-16 ***
primary          0.140590   0.010968  12.818   <2e-16 ***
y10              0.286572   0.020428  14.028   <2e-16 ***
y12              0.459539   0.020797  22.097   <2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.121500   0.011962 -10.157   <2e-16 ***
Guangdong       -0.115989   0.011790  -9.838   <2e-16 ***
Gansu           -0.187864   0.0


Call:
lm(formula = lnpinc ~ postsecondary + seniorsecondary + juniorsecondary + 
    primary + y10 + y12 + y14 + Liaoning + Guangdong + Gansu + 
    Others, data = male_rural_reg)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.45388 -0.16688  0.02253  0.20410  0.92736 

Coefficients: (1 not defined because of singularities)
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -0.765908   0.024613 -31.118   <2e-16 ***
postsecondary    0.252455   0.017680  14.279   <2e-16 ***
seniorsecondary  0.194298   0.013003  14.942   <2e-16 ***
juniorsecondary  0.174013   0.008654  20.107   <2e-16 ***
primary          0.123057   0.008682  14.173   <2e-16 ***
y10              0.392415   0.018202  21.560   <2e-16 ***
y12              0.670530   0.019075  35.152   <2e-16 ***
y14                    NA         NA      NA       NA    
Liaoning        -0.221425   0.020377 -10.866   <2e-16 ***
Guangdong       -0.174868   0.020311  -8.609   <2e-16 ***
Gansu           -0.26